1
0
mirror of https://github.com/rspeer/wordfreq.git synced 2025-01-14 05:05:59 +00:00

Merge pull request from LuminosoInsight/all-1.6-changes

All 1.6 changes
This commit is contained in:
Andrew Lin 2017-02-01 15:36:38 -05:00 committed by GitHub
commit 72e3678e89
110 changed files with 25927 additions and 26262 deletions
CHANGELOG.mdREADME.md
scripts
setup.py
tests
wordfreq
__init__.py
data
combined_ar.msgpack.gzcombined_bg.msgpack.gzcombined_ca.msgpack.gzcombined_cs.msgpack.gzcombined_da.msgpack.gzcombined_de.msgpack.gzcombined_el.msgpack.gzcombined_en.msgpack.gzcombined_es.msgpack.gzcombined_fa.msgpack.gzcombined_fi.msgpack.gzcombined_fr.msgpack.gzcombined_he.msgpack.gzcombined_hi.msgpack.gzcombined_hu.msgpack.gzcombined_id.msgpack.gzcombined_it.msgpack.gzcombined_ja.msgpack.gzcombined_ko.msgpack.gzcombined_ms.msgpack.gzcombined_nb.msgpack.gzcombined_nl.msgpack.gzcombined_pl.msgpack.gzcombined_pt.msgpack.gzcombined_ro.msgpack.gzcombined_ru.msgpack.gzcombined_sh.msgpack.gzcombined_sv.msgpack.gzcombined_tr.msgpack.gzcombined_uk.msgpack.gzcombined_zh.msgpack.gzjieba_zh.txtlarge_ar.msgpack.gzlarge_de.msgpack.gzlarge_en.msgpack.gzlarge_es.msgpack.gzlarge_fi.msgpack.gzlarge_fr.msgpack.gzlarge_it.msgpack.gzlarge_ja.msgpack.gzlarge_nl.msgpack.gzlarge_pl.msgpack.gzlarge_pt.msgpack.gzlarge_ru.msgpack.gzlarge_zh.msgpack.gztwitter_ar.msgpack.gztwitter_bg.msgpack.gztwitter_ca.msgpack.gztwitter_cs.msgpack.gztwitter_da.msgpack.gztwitter_de.msgpack.gztwitter_en.msgpack.gztwitter_es.msgpack.gztwitter_fa.msgpack.gztwitter_fi.msgpack.gztwitter_fr.msgpack.gztwitter_he.msgpack.gztwitter_hi.msgpack.gztwitter_hu.msgpack.gztwitter_id.msgpack.gztwitter_it.msgpack.gztwitter_ja.msgpack.gztwitter_ko.msgpack.gztwitter_ms.msgpack.gztwitter_nb.msgpack.gztwitter_nl.msgpack.gztwitter_pl.msgpack.gztwitter_pt.msgpack.gztwitter_ro.msgpack.gztwitter_ru.msgpack.gztwitter_sh.msgpack.gztwitter_sv.msgpack.gztwitter_tr.msgpack.gztwitter_uk.msgpack.gz
tokens.pytransliterate.py
wordfreq_builder

View File

@ -1,3 +1,26 @@
## Version 1.6.0 (2017-01-05)
- Support Czech, Persian, Ukrainian, and Croatian/Bosnian/Serbian
- Add large lists in Chinese, Finnish, Japanese, and Polish
- Data is now collected and built using Exquisite Corpus
(https://github.com/LuminosoInsight/exquisite-corpus)
- Add word frequencies from OPUS OpenSubtitles 2016
- Add word frequencies from the MOKK Hungarian Webcorpus
- Expand Google Books Ngrams data to cover 8 languages
- Expand language detection on Reddit to cover 13 languages with large enough
Reddit communities
- Drop the Common Crawl; we have enough good sources now that we don't have
to deal with all that spam
- Add automatic transliteration of Serbian text
- Adjust tokenization of apostrophes next to vowel sounds: the French word
"l'heure" is now tokenized similarly to "l'arc"
- Numbers longer than a single digit are smashed into the same word frequency,
to remove meaningless differences and increase compatibility with word2vec.
(Internally, their digits are replaced by zeroes.)
- Another new frequency-merging strategy (drop the highest and lowest,
average the rest)
## Version 1.5.1 (2016-08-19) ## Version 1.5.1 (2016-08-19)
- Bug fix: Made it possible to load the Japanese or Korean dictionary when the - Bug fix: Made it possible to load the Japanese or Korean dictionary when the

214
README.md
View File

@ -106,16 +106,16 @@ frequencies by a million (1e6) to get more readable numbers:
>>> from wordfreq import word_frequency >>> from wordfreq import word_frequency
>>> word_frequency('cafe', 'en') * 1e6 >>> word_frequency('cafe', 'en') * 1e6
12.88249551693135 11.748975549395302
>>> word_frequency('café', 'en') * 1e6 >>> word_frequency('café', 'en') * 1e6
3.3884415613920273 3.981071705534969
>>> word_frequency('cafe', 'fr') * 1e6 >>> word_frequency('cafe', 'fr') * 1e6
2.6302679918953817 1.4125375446227555
>>> word_frequency('café', 'fr') * 1e6 >>> word_frequency('café', 'fr') * 1e6
87.09635899560814 53.70317963702532
`zipf_frequency` is a variation on `word_frequency` that aims to return the `zipf_frequency` is a variation on `word_frequency` that aims to return the
@ -133,19 +133,19 @@ one occurrence per billion words.
>>> from wordfreq import zipf_frequency >>> from wordfreq import zipf_frequency
>>> zipf_frequency('the', 'en') >>> zipf_frequency('the', 'en')
7.67 7.75
>>> zipf_frequency('word', 'en') >>> zipf_frequency('word', 'en')
5.39 5.32
>>> zipf_frequency('frequency', 'en') >>> zipf_frequency('frequency', 'en')
4.19 4.36
>>> zipf_frequency('zipf', 'en') >>> zipf_frequency('zipf', 'en')
0.0 0.0
>>> zipf_frequency('zipf', 'en', wordlist='large') >>> zipf_frequency('zipf', 'en', wordlist='large')
1.65 1.28
The parameters to `word_frequency` and `zipf_frequency` are: The parameters to `word_frequency` and `zipf_frequency` are:
@ -175,10 +175,10 @@ the list, in descending frequency order.
>>> from wordfreq import top_n_list >>> from wordfreq import top_n_list
>>> top_n_list('en', 10) >>> top_n_list('en', 10)
['the', 'i', 'to', 'a', 'and', 'of', 'you', 'in', 'that', 'is'] ['the', 'to', 'of', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
>>> top_n_list('es', 10) >>> top_n_list('es', 10)
['de', 'que', 'la', 'y', 'a', 'en', 'el', 'no', 'los', 'es'] ['de', 'la', 'que', 'en', 'el', 'y', 'a', 'los', 'no', 'se']
`iter_wordlist(lang, wordlist='combined')` iterates through all the words in a `iter_wordlist(lang, wordlist='combined')` iterates through all the words in a
wordlist, in descending frequency order. wordlist, in descending frequency order.
@ -205,65 +205,79 @@ limiting the selection to words that can be typed in ASCII.
## Sources and supported languages ## Sources and supported languages
We compiled word frequencies from seven different sources, providing us This data comes from a Luminoso project called [Exquisite Corpus][xc], whose
examples of word usage on different topics at different levels of formality. goal is to download good, varied, multilingual corpus data, process it
The sources (and the abbreviations we'll use for them) are: appropriately, and combine it into unified resources such as wordfreq.
- **LeedsIC**: The Leeds Internet Corpus [xc]: https://github.com/LuminosoInsight/exquisite-corpus
- **SUBTLEX**: The SUBTLEX word frequency lists
- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
- **Twitter**: Messages sampled from Twitter's public stream
- **Wpedia**: The full text of Wikipedia in 2015
- **Reddit**: The corpus of Reddit comments through May 2015
- **CCrawl**: Text extracted from the Common Crawl and language-detected with cld2
- **Other**: We get additional English frequencies from Google Books Syntactic
Ngrams 2013, and Chinese frequencies from the frequency dictionary that
comes with the Jieba tokenizer.
The following 27 languages are supported, with reasonable tokenization and at Exquisite Corpus compiles 8 different domains of text, some of which themselves
come from multiple sources:
- **Wikipedia**, representing encyclopedic text
- **Subtitles**, from OPUS OpenSubtitles 2016 and SUBTLEX
- **News**, from NewsCrawl 2014 and GlobalVoices
- **Books**, from Google Books Ngrams 2012
- **Web** text, from the Leeds Internet Corpus and the MOKK Hungarian Webcorpus
- **Twitter**, representing short-form social media
- **Reddit**, representing potentially longer Internet comments
- **Miscellaneous** word frequencies: in Chinese, we import a free wordlist
that comes with the Jieba word segmenter, whose provenance we don't really know
The following languages are supported, with reasonable tokenization and at
least 3 different sources of word frequencies: least 3 different sources of word frequencies:
Language Code Sources Large? SUBTLEX OpenSub LeedsIC Twitter Wpedia CCrawl Reddit Other Language Code # Large? WP Subs News Books Web Twit. Redd. Misc.
───────────────────────────────────┼────────────────────────────────────────────────────────────── ──────────────────────────────┼────────────────────────────────────────────────
Arabic ar 5 Yes │ - Yes Yes Yes Yes Yes - - Arabic ar 5 Yes │ Yes Yes Yes - Yes Yes - -
Bulgarian bg 3 - │ - Yes - - Yes Yes - - Bosnian bs [1] 3 │ Yes Yes - - - Yes - -
Catalan ca 3 - │ - Yes - Yes Yes - - - Bulgarian bg 3 - │ Yes Yes - - - Yes - -
Danish da 3 - │ - Yes - - Yes Yes - - Catalan ca 4 - │ Yes Yes Yes - - Yes - -
German de 5 Yes │ Yes - Yes Yes Yes Yes - - Czech cs 3 - │ Yes Yes - - - Yes - -
Greek el 4 - │ - Yes Yes - Yes Yes - - Danish da 3 - │ Yes Yes - - - Yes - -
English en 7 Yes │ Yes Yes Yes Yes Yes - Yes Google Books German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Spanish es 6 Yes │ - Yes Yes Yes Yes Yes Yes - Greek el 3 - │ Yes Yes - - Yes - - -
Finnish fi 3 - │ - Yes - - Yes Yes - - English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
French fr 5 Yes │ - Yes Yes Yes Yes Yes - - Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Hebrew he 4 - │ - Yes - Yes Yes Yes - - Persian fa 3 - │ Yes Yes - - - Yes - -
Hindi hi 3 - │ - - - Yes Yes Yes - - Finnish fi 5 Yes │ Yes Yes Yes - - Yes Yes -
Hungarian hu 3 - │ - Yes - - Yes Yes - - French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Indonesian id 4 - │ - Yes - Yes Yes Yes - - Hebrew he 4 - │ Yes Yes - Yes - Yes - -
Italian it 5 Yes │ - Yes Yes Yes Yes Yes - - Hindi hi 3 - │ Yes - - - - Yes Yes -
Japanese ja 4 - │ - - Yes Yes Yes Yes - - Croatian hr [1] 3 │ Yes Yes - - - Yes - -
Korean ko 3 - │ - - - Yes Yes Yes - - Hungarian hu 3 - │ Yes Yes - - Yes - - -
Malay ms 4 - │ - Yes - Yes Yes Yes - - Indonesian id 3 - │ Yes Yes - - - Yes - -
Norwegian nb[1] 3 - │ - Yes - - Yes Yes - - Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Dutch nl 5 Yes │ Yes Yes - Yes Yes Yes - - Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
Polish pl 4 - │ - Yes - Yes Yes Yes - - Korean ko 4 - │ Yes Yes - - - Yes Yes -
Portuguese pt 5 Yes │ - Yes Yes Yes Yes Yes - - Malay ms 3 - │ Yes Yes - - - Yes - -
Romanian ro 3 - │ - Yes - - Yes Yes - - Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
Russian ru 5 Yes │ - Yes Yes Yes Yes Yes - - Dutch nl 4 Yes │ Yes Yes Yes - - Yes - -
Swedish sv 4 - │ - Yes - Yes Yes Yes - - Polish pl 5 Yes │ Yes Yes Yes - - Yes Yes -
Turkish tr 4 - │ - Yes - Yes Yes Yes - - Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
Chinese zh[2] 5 - │ Yes - Yes - Yes Yes - Jieba Romanian ro 3 - │ Yes Yes - - - Yes - -
Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
Swedish sv 4 - │ Yes Yes - - - Yes Yes -
Turkish tr 3 - │ Yes Yes - - - Yes - -
Ukrainian uk 4 - │ Yes Yes - - - Yes Yes -
Chinese zh [3] 6 Yes │ Yes - Yes Yes Yes Yes - Jieba
[1] The Norwegian text we have is specifically written in Norwegian Bokmål, so [1] Bosnian, Croatian, and Serbian use the same underlying word list, because
we give it the language code 'nb'. We would use 'nn' for Nynorsk, but there they share most of their vocabulary and grammar, they were once considered the
isn't enough data to include it in wordfreq. same language, and language detection cannot distinguish them. This word list
can also be accessed with the language code `sh`.
[2] This data represents text written in both Simplified and Traditional [2] The Norwegian text we have is specifically written in Norwegian Bokmål, so
Chinese. (SUBTLEX is mostly Simplified, while Wikipedia is mostly Traditional.) we give it the language code 'nb' instead of the vaguer code 'no'. We would use
The characters are mapped to one another so they can use the same word 'nn' for Nynorsk, but there isn't enough data to include it in wordfreq.
frequency list.
[3] This data represents text written in both Simplified and Traditional
Chinese, with primarily Mandarin Chinese vocabulary. See "Multi-script
languages" below.
Some languages provide 'large' wordlists, including words with a Zipf frequency Some languages provide 'large' wordlists, including words with a Zipf frequency
between 1.0 and 3.0. These are available in 9 languages that are covered by between 1.0 and 3.0. These are available in 13 languages that are covered by
enough data sources. enough data sources.
@ -298,9 +312,9 @@ also try to deal gracefully when you query it with texts that actually break
into multiple tokens: into multiple tokens:
>>> zipf_frequency('New York', 'en') >>> zipf_frequency('New York', 'en')
5.07 5.35
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway" >>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
3.58 3.56
The word frequencies are combined with the half-harmonic-mean function in order The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese, to provide an estimate of what their combined frequency would be. In Chinese,
@ -315,7 +329,56 @@ you give it an uncommon combination of tokens, it will hugely over-estimate
their frequency: their frequency:
>>> zipf_frequency('owl-flavored', 'en') >>> zipf_frequency('owl-flavored', 'en')
3.19 3.18
## Multi-script languages
Two of the languages we support, Serbian and Chinese, are written in multiple
scripts. To avoid spurious differences in word frequencies, we automatically
transliterate the characters in these languages when looking up their words.
Serbian text written in Cyrillic letters is automatically converted to Latin
letters, using standard Serbian transliteration, when the requested language is
`sr` or `sh`. If you request the word list as `hr` (Croatian) or `bs`
(Bosnian), no transliteration will occur.
Chinese text is converted internally to a representation we call
"Oversimplified Chinese", where all Traditional Chinese characters are replaced
with their Simplified Chinese equivalents, *even if* they would not be written
that way in context. This representation lets us use a straightforward mapping
that matches both Traditional and Simplified words, unifying their frequencies
when appropriate, and does not appear to create clashes between unrelated words.
Enumerating the Chinese wordlist will produce some unfamiliar words, because
people don't actually write in Oversimplified Chinese, and because in
practice Traditional and Simplified Chinese also have different word usage.
## Similar, overlapping, and varying languages
As much as we would like to give each language its own distinct code and its
own distinct word list with distinct source data, there aren't actually sharp
boundaries between languages.
Sometimes, it's convenient to pretend that the boundaries between
languages coincide with national borders, following the maxim that "a language
is a dialect with an army and a navy" (Max Weinreich). This gets complicated
when the linguistic situation and the political situation diverge.
Moreover, some of our data sources rely on language detection, which of course
has no idea which country the writer of the text belongs to.
So we've had to make some arbitrary decisions about how to represent the
fuzzier language boundaries, such as those within Chinese, Malay, and
Croatian/Bosnian/Serbian. See [Language Log][] for some firsthand reports of
the mutual intelligibility or unintelligibility of languages.
[Language Log]: http://languagelog.ldc.upenn.edu/nll/?p=12633
Smoothing over our arbitrary decisions is the fact that we use the `langcodes`
module to find the best match for a language code. If you ask for word
frequencies in `cmn-Hans` (the fully specific language code for Mandarin in
Simplified Chinese), you will get the `zh` wordlist, for example.
## License ## License
@ -345,6 +408,10 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org) - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
It contains data from OPUS OpenSubtitles 2016
(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
the OpenSubtitles project (http://www.opensubtitles.org/).
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
(see citations below) and available at (see citations below) and available at
@ -371,7 +438,8 @@ If you use wordfreq in your research, please cite it! We publish the code
through Zenodo so that it can be reliably cited using a DOI. The current through Zenodo so that it can be reliably cited using a DOI. The current
citation is: citation is:
> Robyn Speer, Joshua Chin, Andrew Lin, Lance Nathan, & Sara Jewett. (2016). wordfreq: v1.5.1 [Data set]. Zenodo. http://doi.org/10.5281/zenodo.61937 > Robyn Speer, Joshua Chin, Andrew Lin, Lance Nathan, & Sara Jewett. (2016).
> wordfreq: v1.5.1 [Data set]. Zenodo. http://doi.org/10.5281/zenodo.61937
The same citation in BibTeX format: The same citation in BibTeX format:
@ -393,6 +461,12 @@ The same citation in BibTex format:
## Citations to work that wordfreq is built on ## Citations to work that wordfreq is built on
- Bojar, O., Chatterjee, R., Federmann, C., Haddow, B., Huck, M., Hokamp, C.,
Koehn, P., Logacheva, V., Monz, C., Negri, M., Post, M., Scarton, C.,
Specia, L., & Turchi, M. (2015). Findings of the 2015 Workshop on Statistical
Machine Translation.
http://www.statmt.org/wmt15/results.html
- Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical - Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical
Evaluation of Current Word Frequency Norms and the Introduction of a New and Evaluation of Current Word Frequency Norms and the Introduction of a New and
Improved Word Frequency Measure for American English. Behavior Research Improved Word Frequency Measure for American English. Behavior Research
@ -418,6 +492,11 @@ The same citation in BibTex format:
- Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29. - Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
http://unicode.org/reports/tr29/ http://unicode.org/reports/tr29/
- Halácsy, P., Kornai, A., Németh, L., Rung, A., Szakadát, I., & Trón, V.
(2004). Creating open language resources for Hungarian. In Proceedings of the
4th international conference on Language Resources and Evaluation (LREC2004).
http://mokk.bme.hu/resources/webcorpus/
- Keuleers, E., Brysbaert, M. & New, B. (2010). SUBTLEX-NL: A new frequency - Keuleers, E., Brysbaert, M. & New, B. (2010). SUBTLEX-NL: A new frequency
measure for Dutch words based on film subtitles. Behavior Research Methods, measure for Dutch words based on film subtitles. Behavior Research Methods,
42(3), 643-650. 42(3), 643-650.
@ -427,6 +506,11 @@ The same citation in BibTex format:
analyzer. analyzer.
http://mecab.sourceforge.net/ http://mecab.sourceforge.net/
- Lison, P. and Tiedemann, J. (2016). OpenSubtitles2016: Extracting Large
Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th
International Conference on Language Resources and Evaluation (LREC 2016).
http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf
- van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). - van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
SUBTLEX-UK: A new and improved word frequency database for British English. SUBTLEX-UK: A new and improved word frequency database for British English.
The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190. The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.

View File

@ -1,41 +0,0 @@
""" This file generates a graph of the dependencies for the ninja build."""
import sys
import re
def ninja_to_dot():
    """
    Convert a ninja build file, read from standard input, into a Graphviz
    'dot' digraph of its dependencies, printed to standard output.

    Every `build` statement contributes one edge per (input, output) pair,
    labeled with the name of the build rule. File names are reduced to their
    last path component, and shard numbers ('part01', '3-of') are replaced by
    'NN' so that parallel shards collapse into a single node. Each distinct
    edge is printed only once.

    Inputs listed after a '|' (implicit) or '||' (order-only) marker are not
    graphed. Lines that are not well-formed build statements are skipped.
    """
    def simplified_filename(path):
        # Keep only the file name and hide the shard numbering.
        component = path.split('/')[-1]
        return re.sub(
            r'[0-9]+-of', 'NN-of',
            re.sub(r'part[0-9]+', 'partNN', component)
        )

    print("digraph G {")
    print('rankdir="LR";')
    seen_edges = set()
    for line in sys.stdin:
        line = line.rstrip()
        # Require the trailing space, so that variable assignments such as
        # 'builddir = ...' are not mistaken for build statements.
        if not line.startswith('build '):
            continue

        # The outputs end at the first colon, which comes from ninja syntax.
        # Splitting only once keeps colons inside the inputs (for example
        # in a URL) from breaking the parse.
        output_text, colon, input_text = line.partition(':')
        inputs = input_text.split()
        if not colon or not inputs:
            continue

        # Whitespace-splitting tolerates repeated spaces; the first word of
        # the output side is the 'build' keyword itself.
        outfiles = [simplified_filename(part) for part in output_text.split()[1:]]
        operation = inputs[0]
        for part in inputs[1:]:
            if part in ('|', '||'):
                # external dependencies start here; let's not graph those
                break
            infile = simplified_filename(part)
            for outfile in outfiles:
                edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
                if edge not in seen_edges:
                    seen_edges.add(edge)
                    print(edge)
    print("}")


if __name__ == '__main__':
    ninja_to_dot()

View File

@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
setup( setup(
name="wordfreq", name="wordfreq",
version='1.5.1', version='1.6',
maintainer='Luminoso Technologies, Inc.', maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com', maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',

View File

@ -22,16 +22,19 @@ def test_freq_examples():
LAUGHTER_WORDS = { LAUGHTER_WORDS = {
'en': 'lol', 'en': 'lol',
'hi': 'lol', 'hi': 'lol',
'cs': 'lol',
'ru': 'лол', 'ru': 'лол',
'zh': '', 'zh': '',
'ja': '', 'ja': '',
'ar': '', 'ar': '',
'fa': 'خخخخ',
'ca': 'jaja', 'ca': 'jaja',
'es': 'jaja', 'es': 'jaja',
'fr': 'ptdr', 'fr': 'ptdr',
'pt': 'kkkk', 'pt': 'kkkk',
'he': 'חחח', 'he': 'חחח',
'bg': 'xaxa', 'bg': 'ахаха',
'uk': 'хаха',
} }
@ -77,7 +80,7 @@ def test_most_common_words():
""" """
return top_n_list(lang, 1)[0] return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'من') eq_(get_most_common('ar'), 'في')
eq_(get_most_common('de'), 'die') eq_(get_most_common('de'), 'die')
eq_(get_most_common('en'), 'the') eq_(get_most_common('en'), 'the')
eq_(get_most_common('es'), 'de') eq_(get_most_common('es'), 'de')
@ -85,6 +88,7 @@ def test_most_common_words():
eq_(get_most_common('it'), 'di') eq_(get_most_common('it'), 'di')
eq_(get_most_common('ja'), '') eq_(get_most_common('ja'), '')
eq_(get_most_common('nl'), 'de') eq_(get_most_common('nl'), 'de')
eq_(get_most_common('pl'), 'w')
eq_(get_most_common('pt'), 'de') eq_(get_most_common('pt'), 'de')
eq_(get_most_common('ru'), 'в') eq_(get_most_common('ru'), 'в')
eq_(get_most_common('tr'), 'bir') eq_(get_most_common('tr'), 'bir')
@ -141,6 +145,19 @@ def test_casefolding():
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca']) eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
def test_number_smashing():
    # Digits are only replaced by zeroes when combine_numbers is requested.
    title = '"715 - CRΣΣKS" by Bon Iver'
    trailing_words = ['crσσks', 'by', 'bon', 'iver']
    eq_(tokenize(title, 'en'), ['715'] + trailing_words)
    eq_(tokenize(title, 'en', combine_numbers=True), ['000'] + trailing_words)
    eq_(
        tokenize(title, 'en', combine_numbers=True, include_punctuation=True),
        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
    )

    # A single digit is left alone; longer numbers keep their length and
    # punctuation but lose their digits.
    smashed_examples = (
        ('1', ['1']),
        ('3.14', ['0.00']),
        ('24601', ['00000']),
    )
    for number, expected in smashed_examples:
        eq_(tokenize(number, 'en', combine_numbers=True), expected)

    # Frequency lookups smash numbers, so two five-digit numbers are
    # indistinguishable.
    first_frequency = word_frequency('24601', 'en')
    second_frequency = word_frequency('90210', 'en')
    eq_(first_frequency, second_frequency)
def test_phrase_freq(): def test_phrase_freq():
ff = word_frequency("flip-flop", 'en') ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0) assert_greater(ff, 0)
@ -159,7 +176,7 @@ def test_not_really_random():
# This not only tests random_ascii_words, it makes sure we didn't end # This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word # up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'1 1 1 1') '00 00 00 00')
@raises(ValueError) @raises(ValueError)

View File

@ -25,8 +25,8 @@ def test_tokens():
eq_( eq_(
tokenize(fact_simplified, 'zh'), tokenize(fact_simplified, 'zh'),
[ [
# he / is / in history / #6 / counter for people # he / is / history / in / #6 / counter for people
'', '', '历史', '第六', '', '', '', '历史', '', '第六', '',
# during / term of office / in / die # during / term of office / in / die
'', '任期', '', '去世', '', '任期', '', '去世',
# of / U.S. / deputy / president # of / U.S. / deputy / president

View File

@ -0,0 +1,31 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably: an elided article
    # or particle is split from the word that follows it (including before
    # «h»), while «aujourd'hui» and English contractions stay in one piece.
    cases = (
        # (text, language, include punctuation?, expected tokens)
        ("qu'un", 'fr', False, ['qu', 'un']),
        ("qu'un", 'fr', True, ["qu'", "un"]),
        ("langues d'oïl", 'fr', False, ['langues', "d", 'oïl']),
        ("langues d'oïl", 'fr', True, ['langues', "d'", 'oïl']),
        ("l'heure", 'fr', False, ['l', 'heure']),
        ("l'heure", 'fr', True, ["l'", 'heure']),
        ("L'Hôpital", 'fr', True, ["l'", 'hôpital']),
        ("aujourd'hui", 'fr', False, ["aujourd'hui"]),
        ("This isn't French", 'en', False, ['this', "isn't", 'french']),
    )
    for text, lang, with_punctuation, expected in cases:
        if with_punctuation:
            tokens = tokenize(text, lang, include_punctuation=True)
        else:
            tokens = tokenize(text, lang)
        eq_(tokens, expected)
def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too: the middle dot must stay inside
    # its word.
    sentence = "M'acabo d'instal·lar."

    words_only = tokenize(sentence, 'ca')
    eq_(words_only, ['m', 'acabo', 'd', 'instal·lar'])

    with_punctuation = tokenize(sentence, 'ca', include_punctuation=True)
    eq_(with_punctuation, ["m'", 'acabo', "d'", 'instal·lar', '.'])

25
tests/test_serbian.py Normal file
View File

@ -0,0 +1,25 @@
from nose.tools import eq_
from wordfreq import tokenize
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    #
    # The Cyrillic and Latin spellings of the sentence must produce the same
    # Latin-script tokens.
    expected = ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    spellings = (
        "Па, има ту много ствари које не схваташ.",
        "Pa, ima tu mnogo stvari koje ne shvataš.",
    )
    for sentence in spellings:
        eq_(tokenize(sentence, 'sr'), expected)
def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    expectations = (
        ("сто из ста, пацаны!", ['sto', 'iz', 'sta', 'pacany']),
        ("культуры", ["kul'tury"]),
    )
    for text, expected_tokens in expectations:
        eq_(tokenize(text, 'sr'), expected_tokens)

View File

@ -216,7 +216,7 @@ def iter_wordlist(lang, wordlist='combined'):
_wf_cache = {} _wf_cache = {}
def _word_frequency(word, lang, wordlist, minimum): def _word_frequency(word, lang, wordlist, minimum):
tokens = tokenize(word, lang) tokens = tokenize(word, lang, combine_numbers=True)
if not tokens: if not tokens:
return minimum return minimum

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,6 +1,6 @@
import regex import regex
import unicodedata import unicodedata
from .transliterate import serbian_cyrillic_to_latin
mecab_tokenize = None mecab_tokenize = None
jieba_tokenize = None jieba_tokenize = None
@ -22,7 +22,6 @@ ABJAD_LANGUAGES = {
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi' 'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
} }
def _make_spaceless_expr(): def _make_spaceless_expr():
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS] pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
return ''.join(pieces) return ''.join(pieces)
@ -60,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
# Case 2: standard Unicode segmentation # Case 2: standard Unicode segmentation
# ------------------------------------- # -------------------------------------
# The start of the token must not be a letter followed by «'h». If it is,
# we should use Case 3 to match up to the apostrophe, then match a new token
# starting with «h». This rule lets us break «l'heure» into two tokens, just
# like we would do for «l'arc».
(?!\w'[Hh])
# The start of the token must be 'word-like', not punctuation or whitespace # The start of the token must be 'word-like', not punctuation or whitespace
# or various other things. However, we allow characters of category So # or various other things. However, we allow characters of category So
# (Symbol - Other) because many of these are emoji, which can convey # (Symbol - Other) because many of these are emoji, which can convey
@ -71,17 +77,29 @@ TOKEN_RE = regex.compile(r"""
# (\S) and do not cause word breaks according to the Unicode word # (\S) and do not cause word breaks according to the Unicode word
# segmentation heuristic (\B), or are categorized as Marks (\p{M}). # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
(?:\B\S|\p{M})* (?:\B\S|\p{M})* |
# Case 3: Fix French
# ------------------
# This allows us to match the articles in French, Catalan, and related
# languages, such as «l'», that we may have excluded from being part of
# the token in Case 2.
\w'
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE) """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
TOKEN_RE_WITH_PUNCTUATION = regex.compile(r""" TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
[<SPACELESS>]+ | [<SPACELESS>]+ |
[\p{punct}]+ | [\p{punct}]+ |
\S(?:\B\S|\p{M})* (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
\w'
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE) """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1) MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
# These patterns are raw strings so that «\d» reaches the regex engine as
# written, instead of relying on Python passing through an invalid string
# escape (which raises a DeprecationWarning or SyntaxWarning on newer Pythons).
#
# DIGIT_RE matches a single digit. MULTI_DIGIT_RE matches a digit followed by
# at least one more digit, period, or comma, so a lone digit never matches it.
DIGIT_RE = regex.compile(r'\d')
MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
def simple_tokenize(text, include_punctuation=False): def simple_tokenize(text, include_punctuation=False):
""" """
@ -113,35 +131,17 @@ def simple_tokenize(text, include_punctuation=False):
would end up in its own token, which is worse. would end up in its own token, which is worse.
""" """
text = unicodedata.normalize('NFC', text) text = unicodedata.normalize('NFC', text)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE if include_punctuation:
return [token.strip("'").casefold() for token in token_expr.findall(text)]
def turkish_tokenize(text, include_punctuation=False):
"""
Like `simple_tokenize`, but modifies i's so that they case-fold correctly
in Turkish, and modifies 'comma-below' characters to use cedillas.
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [ return [
commas_to_cedillas(token.strip("'").casefold()) token.casefold()
for token in token_expr.findall(text) for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
] ]
else:
def romanian_tokenize(text, include_punctuation=False):
"""
Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
to use commas-below instead.
"""
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [ return [
cedillas_to_commas(token.strip("'").casefold()) token.strip("'").casefold()
for token in token_expr.findall(text) for token in TOKEN_RE.findall(text)
] ]
def tokenize_mecab_language(text, lang, include_punctuation=False): def tokenize_mecab_language(text, lang, include_punctuation=False):
""" """
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary. Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@ -213,8 +213,48 @@ def cedillas_to_commas(text):
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}' '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
) )
def preprocess_turkish(text):
    """
    Prepare Turkish text for tokenization.

    The text is NFC-normalized, then capital dotted 'İ' becomes 'i' and
    capital 'I' becomes dotless 'ı' so that case-folding gives the Turkish
    result. Finally the text is case-folded and its 'comma-below' characters
    are changed to use cedillas.
    """
    normalized = unicodedata.normalize('NFC', text)
    turkish_cased = normalized.replace('İ', 'i').replace('I', 'ı')
    folded = turkish_cased.casefold()
    return commas_to_cedillas(folded)
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
def preprocess_romanian(text):
    """
    Case-fold the text, then change the letters ş and ţ (with cedillas) to
    their comma-below forms.
    """
    folded = text.casefold()
    return cedillas_to_commas(folded)
def preprocess_serbian(text):
    """
    Convert Serbian text to a single script. Serbian can be written in
    either Cyrillic or Latin, and Cyrillic-to-Latin is the unambiguous
    direction, so any Cyrillic is transliterated to Latin.
    """
    latin_text = serbian_cyrillic_to_latin(text)
    return latin_text
def sub_zeroes(match):
    """
    Replacement callback for a regex substitution: returns the matched text
    with every digit turned into a zero.
    """
    matched_text = match.group(0)
    return DIGIT_RE.sub('0', matched_text)
def smash_numbers(text):
    """
    Collapse multi-digit numbers by replacing their digits with zeroes, so
    that thousands of distinct numbers don't each need their own frequency.
    """
    zeroed = MULTI_DIGIT_RE.sub(sub_zeroes, text)
    return zeroed
def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
combine_numbers=False):
""" """
Tokenize this text in a way that's relatively simple but appropriate for Tokenize this text in a way that's relatively simple but appropriate for
the language. Strings that are looked up in wordfreq will be run through the language. Strings that are looked up in wordfreq will be run through
@ -229,6 +269,17 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
- CJK scripts: Chinese, Japanese, Korean - CJK scripts: Chinese, Japanese, Korean
- Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc. - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
The options `include_punctuation`, `external_wordlist`, and
`combine_numbers` are passed on to the appropriate tokenizer:
- `include_punctuation` preserves punctuation as tokens, instead of
removing it.
- `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
instead of wordfreq's wordlist.
- `combine_numbers` replaces multi-digit numbers with strings of zeroes.
Alphabetic scripts Alphabetic scripts
------------------ ------------------
@ -310,17 +361,27 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
does not support these languages yet. It will split on spaces and does not support these languages yet. It will split on spaces and
punctuation, giving tokens that are far too long. punctuation, giving tokens that are far too long.
""" """
# A really simple way to handle language codes with more than just the
# language
lang = lang.split('-')[0]
if lang == 'ja' or lang == 'ko': if lang == 'ja' or lang == 'ko':
return tokenize_mecab_language(text, lang, include_punctuation) result = tokenize_mecab_language(text, lang, include_punctuation)
elif lang == 'zh': elif lang == 'zh':
return chinese_tokenize(text, include_punctuation, external_wordlist) result = chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr': elif lang == 'tr':
return turkish_tokenize(text, include_punctuation) result = simple_tokenize(preprocess_turkish(text), include_punctuation)
elif lang == 'ro': elif lang == 'ro':
return romanian_tokenize(text, include_punctuation) result = simple_tokenize(preprocess_romanian(text), include_punctuation)
elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
# These are the three language codes that could include Serbian text,
# which could be in Cyrillic.
result = simple_tokenize(preprocess_serbian(text), include_punctuation)
elif lang in ABJAD_LANGUAGES: elif lang in ABJAD_LANGUAGES:
text = remove_marks(unicodedata.normalize('NFKC', text)) text = remove_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text, include_punctuation) result = simple_tokenize(text, include_punctuation)
else: else:
return simple_tokenize(text, include_punctuation) result = simple_tokenize(text, include_punctuation)
if combine_numbers:
result = [smash_numbers(token) for token in result]
return result

70
wordfreq/transliterate.py Normal file
View File

@ -0,0 +1,70 @@
# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
# Translation table for `str.translate`: maps the code point of each Cyrillic
# letter to its Latin replacement string. Some letters expand to two Latin
# letters (for example Љ -> Lj).
SR_CYRL_TO_LATN_DICT = {
    ord('А'): 'A', ord('а'): 'a',
    ord('Б'): 'B', ord('б'): 'b',
    ord('В'): 'V', ord('в'): 'v',
    ord('Г'): 'G', ord('г'): 'g',
    ord('Д'): 'D', ord('д'): 'd',
    ord('Ђ'): 'Đ', ord('ђ'): 'đ',
    ord('Е'): 'E', ord('е'): 'e',
    ord('Ж'): 'Ž', ord('ж'): 'ž',
    ord('З'): 'Z', ord('з'): 'z',
    ord('И'): 'I', ord('и'): 'i',
    ord('Ј'): 'J', ord('ј'): 'j',
    ord('К'): 'K', ord('к'): 'k',
    ord('Л'): 'L', ord('л'): 'l',
    ord('Љ'): 'Lj', ord('љ'): 'lj',
    ord('М'): 'M', ord('м'): 'm',
    ord('Н'): 'N', ord('н'): 'n',
    ord('Њ'): 'Nj', ord('њ'): 'nj',
    ord('О'): 'O', ord('о'): 'o',
    ord('П'): 'P', ord('п'): 'p',
    ord('Р'): 'R', ord('р'): 'r',
    ord('С'): 'S', ord('с'): 's',
    ord('Т'): 'T', ord('т'): 't',
    ord('Ћ'): 'Ć', ord('ћ'): 'ć',
    ord('У'): 'U', ord('у'): 'u',
    ord('Ф'): 'F', ord('ф'): 'f',
    ord('Х'): 'H', ord('х'): 'h',
    ord('Ц'): 'C', ord('ц'): 'c',
    ord('Ч'): 'Č', ord('ч'): 'č',
    # Џ is a Serbian letter; it must map to the digraph Dž, not be deleted.
    ord('Џ'): 'Dž', ord('џ'): 'dž',
    ord('Ш'): 'Š', ord('ш'): 'š',

    # Handle Cyrillic letters from other languages. We hope these cases don't
    # come up often when we're trying to transliterate Serbian, but if these
    # letters show up in loan-words or code-switching text, we can at least
    # transliterate them approximately instead of leaving them as Cyrillic
    # letters surrounded by Latin.

    # Russian letters
    ord('Ё'): 'Jo', ord('ё'): 'jo',
    ord('Й'): 'J', ord('й'): 'j',
    ord('Щ'): 'Šč', ord('щ'): 'šč',
    # The hard sign has no Latin counterpart here, so it is dropped.
    ord('Ъ'): '', ord('ъ'): '',
    ord('Ы'): 'Y', ord('ы'): 'y',
    ord('Ь'): "'", ord('ь'): "'",
    ord('Э'): 'E', ord('э'): 'e',
    ord('Ю'): 'Ju', ord('ю'): 'ju',
    ord('Я'): 'Ja', ord('я'): 'ja',

    # Belarusian letter
    ord('Ў'): 'Ŭ', ord('ў'): 'ŭ',

    # Ukrainian letters
    ord('Є'): 'Je', ord('є'): 'je',
    ord('І'): 'I', ord('і'): 'i',
    ord('Ї'): 'Ji', ord('ї'): 'ji',
    ord('Ґ'): 'G', ord('ґ'): 'g',

    # Macedonian letters
    ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
    ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
    # K with acute, parallel to Ѓ -> Ǵ above. Written with \N escapes so the
    # precomposed characters can't be lost or altered in transit.
    ord('Ќ'): '\N{LATIN CAPITAL LETTER K WITH ACUTE}',
    ord('ќ'): '\N{LATIN SMALL LETTER K WITH ACUTE}',
}


def serbian_cyrillic_to_latin(text):
    """
    Transliterate the Cyrillic letters in `text` to Latin letters, using
    SR_CYRL_TO_LATN_DICT.

    Characters that are not in the table (Latin letters, digits, punctuation,
    and so on) are returned unchanged. Case is preserved letter by letter; a
    capital letter that expands to two Latin letters becomes title-cased
    (for example 'Љ' becomes 'Lj').
    """
    return text.translate(SR_CYRL_TO_LATN_DICT)

View File

@ -1,12 +0,0 @@
*.pyc
__pycache__
.coverage
.idea
dist
*.egg-info
build
_build
build.ninja
data
.ninja_deps
.ninja_log

View File

@ -1,8 +0,0 @@
PYTHON = python
all: build.ninja
# build the Ninja file that will take over the build process
build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja

View File

@ -1,194 +0,0 @@
# wordfreq\_builder
This package builds the data files for [wordfreq](https://github.com/LuminosoInsight/wordfreq).
It requires a fair amount of external input data (42 GB of it, as of this
writing), which unfortunately we don't have a plan for how to distribute
outside of Luminoso yet.
The data can be publicly obtained in various ways, so here we'll at least
document where it comes from. We hope to come up with a process that's more
reproducible eventually.
The good news is that you don't need to be able to run this process to use
wordfreq. The built results are already in the `wordfreq/data` directory.
## How to build it
Set up your external hard disk, your networked file system, or whatever thing
you have that's got a couple hundred GB of space free. Let's suppose the
directory of it that you want to use is called `/ext/data`.
Get the input data. At Luminoso, this is available in the directory
`/nfs/broadway/data/wordfreq_builder`. The sections below explain where the
data comes from.
Copy the input data:
cp -rv /nfs/broadway/data/wordfreq_builder /ext/data/
Make a symbolic link so that `data/` in this directory points to
your copy of the input data:
ln -s /ext/data/wordfreq_builder data
Install the Ninja build system:
sudo apt-get install ninja-build
We need to build a Ninja build file using the Python code in
`wordfreq_builder/ninja.py`. We could do this with Ninja, but... you see the
chicken-and-egg problem, don't you. So this is the one thing the Makefile
knows how to do.
make
Start the build, and find something else to do for a few hours:
ninja -v
You can copy the results into wordfreq with this command:
cp data/dist/*.msgpack.gz ../wordfreq/data/
## The Ninja build process
Ninja is a lot like Make, except with one big {drawback|advantage}: instead of
writing bizarre expressions in an idiosyncratic language to let Make calculate
which files depend on which other files...
...you just tell Ninja which files depend on which other files.
The Ninja documentation suggests using your favorite scripting language to
create the dependency list, so that's what we've done in `ninja.py`.
Dependencies in Ninja refer to build rules. These do need to be written by hand
in Ninja's own format, but the task is simpler. In this project, the build
rules are defined in `rules.ninja`. They'll be concatenated with the
Python-generated dependency definitions to form the complete build file,
`build.ninja`, which is the default file that Ninja looks at when you run
`ninja`.
So a lot of the interesting work in this package is done in `rules.ninja`.
This file defines shorthand names for long commands. As a simple example,
the rule named `format_twitter` applies the command
python -m wordfreq_builder.cli.format_twitter $in $out
to the dependency file `$in` and the output file `$out`.
The specific rules are described by the comments in `rules.ninja`.
## Data sources
### Leeds Internet Corpus
Also known as the "Web as Corpus" project, this is a University of Leeds
project that collected wordlists in assorted languages by crawling the Web.
The results are messy, but they're something. We've been using them for quite
a while.
These files can be downloaded from the [Leeds corpus page][leeds].
The original files are in `data/source-lists/leeds`, and they're processed
by the `convert_leeds` rule in `rules.ninja`.
[leeds]: http://corpus.leeds.ac.uk/list.html
### Twitter
The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
collected by the `ftfy.streamtester` package in 2014.
We are not allowed to distribute the text of tweets. However, this process could
be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
a couple of weeks.
[ftfy]: https://github.com/LuminosoInsight/python-ftfy
### Google Books
We use English word frequencies from [Google Books Syntactic Ngrams][gbsn].
We pretty much ignore the syntactic information, and only use this version
because it's cleaner. The data comes in the form of 99 gzipped text files in
`data/raw-input/google-books`.
[gbsn]: http://commondatastorage.googleapis.com/books/syntactic-ngrams/index.html
### Wikipedia
Another source we use is the full text of Wikipedia in various languages. This
text can be difficult to extract efficiently, and for this purpose we use a
custom tool written in Nim 0.11, called [wiki2text][]. To build the Wikipedia
data, you need to separately install Nim and wiki2text.
The input data files are the XML dumps that can be found on the [Wikimedia
backup index][wikidumps]. For example, to get the latest French data, go to
https://dumps.wikimedia.org/frwiki/latest and look for the filename of the form
`*.pages-articles.xml.bz2`. If this file isn't there, look for an older dump
where it is. You'll need to download such a file for each language that's
configured for Wikipedia in `wordfreq_builder/config.py`.
[wiki2text]: https://github.com/rspeer/wiki2text
[wikidumps]: https://dumps.wikimedia.org/backup-index.html
### OpenSubtitles
[Hermit Dave](https://invokeit.wordpress.com/frequency-word-lists/) made word
frequency lists out of the subtitle text on OpenSubtitles. This data was
used to make Wiktionary word frequency lists at one point, but it's been
updated significantly since the version Wiktionary got.
The wordlists are in `data/source-lists/opensubtitles`.
In order to fit into the wordfreq pipeline, we renamed lists with different variants
of the same language code, to distinguish them fully according to BCP 47. Then we
concatenated the different variants into a single list, as follows:
* `zh_tw.txt` was renamed to `zh-Hant.txt`
* `zh_cn.txt` was renamed to `zh-Hans.txt`
* `zh.txt` was renamed to `zh-Hani.txt`
* `zh-Hant.txt`, `zh-Hans.txt`, and `zh-Hani.txt` were concatenated into `zh.txt`
* `pt.txt` was renamed to `pt-PT.txt`
* `pt_br.txt` was renamed to `pt-BR.txt`
* `pt-BR.txt` and `pt-PT.txt` were concatenated into `pt.txt`
We also edited the English data to re-add "'t" to words that had obviously lost
it, such as "didn" in the place of "didn't". We applied this to words that
became much less common words in the process, which means this wordlist no
longer represents the words 'don' and 'won', as we assume most of their
frequency comes from "don't" and "won't". Words that turned into similarly
common words, however, were left alone: this list doesn't represent "can't"
because the word was left as "can".
### SUBTLEX
Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in
wordfreq and derived works without the "academic use" restriction, under the
following reasonable conditions:
- Wordfreq and code derived from it must credit the SUBTLEX authors.
(See the citations in the top-level `README.md` file.)
- It must remain clear that SUBTLEX is freely available data.
`data/source-lists/subtlex` contains the following files:
- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
and exported from Excel format to tab-separated UTF-8 using LibreOffice
- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
extracted, and converted from ISO-8859-1 to UTF-8
- `subtlex.en-GB.txt`, which was downloaded as
[SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
tab-separated UTF-8 using LibreOffice
- `subtlex.nl.txt`, which was downloaded as
[SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
- `subtlex.zh.txt`, which was downloaded as
[subtlexch131210.zip][subtlex-ch] and extracted
[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip

Binary file not shown.

Before

(image error) Size: 1.9 MiB

Binary file not shown.

View File

@ -1,117 +0,0 @@
# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.
# Variables
JQ = lib/jq-linux64
# How to build the build.ninja file itself. (Use the Makefile to get it the
# first time.)
rule build_deps
command = python -m wordfreq_builder.cli.build_deps $in > $out
# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
command = bunzip2 -c $in | wiki2text > $out
# To tokenize Japanese, we run it through Mecab and take the first column.
rule tokenize_japanese
command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
# Process Chinese by converting all Traditional Chinese characters to
# Simplified equivalents -- not because that's a good way to get readable
# text, but because that's how we're going to look them up.
rule simplify_chinese
command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.
rule tokenize_twitter
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
rule tokenize_reddit
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
# with /p to output only lines where the match was successful.
#
# Grep out the term "EOS", an indication that Leeds used MeCab and didn't
# strip out the EOS lines.
rule convert_leeds
command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
command = tr ' ' ',' < $in > $out
# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
# run it through ftfy, convert tabs to commas and spurious CSV formatting to
# spaces, and remove lines with unfixable half-mojibake.
rule convert_subtlex
command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
rule convert_jieba
command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out
rule counts_to_jieba
command = python -m wordfreq_builder.cli.counts_to_jieba $in $out
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
# lines with counts of 100 or more.
#
# (These will still be repeated as the word appears in different grammatical
# roles, information that the source data provides that we're discarding. The
# source data was already filtered to only show words in roles with at least
# two-digit counts of occurrences.)
rule convert_google_syntactic_ngrams
command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
rule count
command = python -m wordfreq_builder.cli.count_tokens $in $out
rule count_langtagged
command = python -m wordfreq_builder.cli.count_tokens_langtagged $in $out -l $language
rule merge
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
rule merge_counts
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
rule freqs2cB
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets
rule cat
command = cat $in > $out
# A pipeline that extracts text from Reddit comments:
# - Unzip the input files
# - Select the body of comments, but only those whose Reddit score is positive
# (skipping the downvoted ones)
# - Skip deleted comments
# - Replace HTML escapes
rule extract_reddit
command = bunzip2 -c $in | $JQ -r 'select(.score > 0) | .body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out

View File

@ -1,13 +0,0 @@
from setuptools import setup
# Package metadata for wordfreq_builder.
setup(
    name="wordfreq_builder",
    version='0.2',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
    install_requires=[
        'msgpack-python',
        'pycld2',
        'langcodes',
    ],
)

View File

@ -1,51 +0,0 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
from nose.tools import eq_
def test_tokenizer_1():
    # A quoted English sentence with contractions and a dollar amount.
    sentence = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    expected = [
        'this', 'is', 'a', 'test', 'she', 'said',
        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
        'it', "won't", 'fail',
    ]
    outcome = cld2_surface_tokenizer(sentence)
    # The tokens are at index 1 of the result and the language code at index 0.
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
def test_tokenizer_2():
    # Punctuation used without surrounding spaces should still split words.
    sentence = "i use punctuation informally...see?like this."
    expected = [
        'i', 'use', 'punctuation', 'informally', 'see',
        'like', 'this'
    ]
    outcome = cld2_surface_tokenizer(sentence)
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
def test_tokenizer_3():
    # The leading @-handle is expected to be absent from the tokens.
    sentence = "@ExampleHandle This parser removes twitter handles!"
    expected = ['this', 'parser', 'removes', 'twitter', 'handles']
    outcome = cld2_surface_tokenizer(sentence)
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
def test_tokenizer_4():
    # The trailing t.co URL is expected to be absent from the tokens, while
    # the plain word 'tco' stays.
    sentence = "This is a really boring example tco http://t.co/n15ASlkase"
    expected = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco']
    outcome = cld2_surface_tokenizer(sentence)
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
def test_language_recognizer_1():
    # A short French sentence should be detected as 'fr'.
    sentence = "Il est le meilleur livre que je ai jamais lu"
    detected = cld2_detect_language(sentence)
    eq_(detected, 'fr')
def test_language_recognizer_2():
    # A multi-line Portuguese passage should be detected as 'pt'.
    passage = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
é uma nuvem esférica de planetesimais voláteis que se acredita
localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""
    detected = cld2_detect_language(passage)
    eq_(detected, 'pt')

View File

@ -1,20 +0,0 @@
from wordfreq_builder.word_counts import URL_RE
from nose.tools import eq_
def check_url(url):
    # URL_RE must match at the start of `url`, and the match must cover the
    # whole string.
    found = URL_RE.match(url)
    assert found
    whole_string = (0, len(url))
    eq_(found.span(), whole_string)
def test_url_re():
    matching_urls = (
        # URLs like this are all over the Arabic Wikipedia. Here's one with
        # the student ID blanked out.
        'http://www.ju.edu.jo/alumnicard/0000000.aspx',
        'https://example.com/űnicode.html',
        'http://☃.net',
    )
    # Generator test: each yielded pair is a check function and its argument.
    for url in matching_urls:
        yield check_url, url
    assert not URL_RE.match('ftp://127.0.0.1')

View File

@ -1,15 +0,0 @@
from wordfreq_builder.ninja import make_ninja_deps
import argparse
def main():
    # Command-line entry point: takes the rules filename as its one argument.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('in_filename', help='filename of rules file')
    rules_filename = arg_parser.parse_args().in_filename

    # Make the complete ninja file and write it to standard out
    make_ninja_deps(rules_filename)


if __name__ == '__main__':
    main()

View File

@ -1,15 +0,0 @@
from wordfreq_builder.word_counts import count_tokens, write_wordlist
import argparse
def handle_counts(filename_in, filename_out):
    # Count the tokens in the input file and write them out as a wordlist.
    write_wordlist(count_tokens(filename_in), filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'filename_in', help='name of input file containing tokens'
    )
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)

View File

@ -1,21 +0,0 @@
"""
Count tokens of text in a particular language, taking input from a
tab-separated file whose first column is a language code. Lines in all
languages except the specified one will be skipped.
"""
from wordfreq_builder.word_counts import count_tokens_langtagged, write_wordlist
import argparse
def handle_counts(filename_in, filename_out, lang):
    # Count tokens on the lines tagged with `lang`, then write the wordlist.
    write_wordlist(count_tokens_langtagged(filename_in, lang), filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'filename_in', help='name of input file containing tokens'
    )
    parser.add_argument('filename_out', help='name of output file')
    parser.add_argument(
        '-l', '--language', help='language tag to filter lines for'
    )
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out, args.language)

View File

@ -1,15 +0,0 @@
from wordfreq_builder.word_counts import read_values, write_jieba
import argparse
def handle_counts(filename_in, filename_out):
    # Read the wordlist with a cutoff of 1e-6 and write it in Jieba's format.
    # read_values returns a pair; only its first element is used here.
    freqs, _unused_total = read_values(filename_in, cutoff=1e-6)
    write_jieba(freqs, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input wordlist')
    parser.add_argument(
        'filename_out', help='name of output Jieba-compatible wordlist'
    )
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)

Some files were not shown because too many files have changed in this diff Show More