Merge pull request #47 from LuminosoInsight/all-1.6-changes

All 1.6 changes

commit 72e3678e89
CHANGELOG.md | 23

@@ -1,3 +1,26 @@
+## Version 1.6.0 (2017-01-05)
+
+- Support Czech, Persian, Ukrainian, and Croatian/Bosnian/Serbian
+- Add large lists in Chinese, Finnish, Japanese, and Polish
+- Data is now collected and built using Exquisite Corpus
+  (https://github.com/LuminosoInsight/exquisite-corpus)
+- Add word frequencies from OPUS OpenSubtitles 2016
+- Add word frequencies from the MOKK Hungarian Webcorpus
+- Expand Google Books Ngrams data to cover 8 languages
+- Expand language detection on Reddit to cover 13 languages with large enough
+  Reddit communities
+- Drop the Common Crawl; we have enough good sources now that we don't have
+  to deal with all that spam
+- Add automatic transliteration of Serbian text
+- Adjust tokenization of apostrophes next to vowel sounds: the French word
+  "l'heure" is now tokenized similarly to "l'arc"
+- Numbers longer than a single digit are smashed into the same word frequency,
+  to remove meaningless differences and increase compatibility with word2vec.
+  (Internally, their digits are replaced by zeroes.)
+- Another new frequency-merging strategy (drop the highest and lowest,
+  average the rest)

 ## Version 1.5.1 (2016-08-19)

 - Bug fix: Made it possible to load the Japanese or Korean dictionary when the
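The "drop the highest and lowest, average the rest" merging strategy in the last bullet can be pictured with a short Python sketch. This is an illustration only, under our own assumptions about naming and about how fewer than three estimates are handled; it is not the code Exquisite Corpus actually runs:

    def merge_freqs(estimates):
        # "Drop the highest and lowest, average the rest." With fewer than
        # three estimates there is nothing to drop, so average them all.
        estimates = sorted(estimates)
        if len(estimates) >= 3:
            estimates = estimates[1:-1]
        return sum(estimates) / len(estimates)

    merge_freqs([1e-6, 2e-6, 9e-6])   # -> 2e-06; the two outliers are ignored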
README.md | 214

@@ -106,16 +106,16 @@ frequencies by a million (1e6) to get more readable numbers:

     >>> from wordfreq import word_frequency
     >>> word_frequency('cafe', 'en') * 1e6
-    12.88249551693135
+    11.748975549395302

     >>> word_frequency('café', 'en') * 1e6
-    3.3884415613920273
+    3.981071705534969

     >>> word_frequency('cafe', 'fr') * 1e6
-    2.6302679918953817
+    1.4125375446227555

     >>> word_frequency('café', 'fr') * 1e6
-    87.09635899560814
+    53.70317963702532


 `zipf_frequency` is a variation on `word_frequency` that aims to return the
@@ -133,19 +133,19 @@ one occurrence per billion words.

     >>> from wordfreq import zipf_frequency
     >>> zipf_frequency('the', 'en')
-    7.67
+    7.75

     >>> zipf_frequency('word', 'en')
-    5.39
+    5.32

     >>> zipf_frequency('frequency', 'en')
-    4.19
+    4.36

     >>> zipf_frequency('zipf', 'en')
     0.0

     >>> zipf_frequency('zipf', 'en', wordlist='large')
-    1.65
+    1.28


 The parameters to `word_frequency` and `zipf_frequency` are:
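The Zipf scale is the base-10 logarithm of a word's frequency per billion words, so the two functions above are directly related. A minimal sketch of that relationship using only the public API (the helper name is ours, and rounding to two decimals is an assumption made to mimic the displayed precision):

    import math
    from wordfreq import word_frequency, zipf_frequency

    def approx_zipf(word, lang):
        # log10 of the frequency per billion words
        freq = word_frequency(word, lang)
        return round(math.log10(freq * 1e9), 2) if freq > 0 else 0.0

    approx_zipf('word', 'en')        # should be close to...
    zipf_frequency('word', 'en')     # ...the value reported here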
@@ -175,10 +175,10 @@ the list, in descending frequency order.

     >>> from wordfreq import top_n_list
     >>> top_n_list('en', 10)
-    ['the', 'i', 'to', 'a', 'and', 'of', 'you', 'in', 'that', 'is']
+    ['the', 'to', 'of', 'and', 'a', 'in', 'i', 'is', 'that', 'for']

     >>> top_n_list('es', 10)
-    ['de', 'que', 'la', 'y', 'a', 'en', 'el', 'no', 'los', 'es']
+    ['de', 'la', 'que', 'en', 'el', 'y', 'a', 'los', 'no', 'se']

 `iter_wordlist(lang, wordlist='combined')` iterates through all the words in a
 wordlist, in descending frequency order.
@@ -205,65 +205,79 @@ limiting the selection to words that can be typed in ASCII.

 ## Sources and supported languages

-We compiled word frequencies from seven different sources, providing us
-examples of word usage on different topics at different levels of formality.
-The sources (and the abbreviations we'll use for them) are:
-
-- **LeedsIC**: The Leeds Internet Corpus
-- **SUBTLEX**: The SUBTLEX word frequency lists
-- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
-- **Twitter**: Messages sampled from Twitter's public stream
-- **Wpedia**: The full text of Wikipedia in 2015
-- **Reddit**: The corpus of Reddit comments through May 2015
-- **CCrawl**: Text extracted from the Common Crawl and language-detected with cld2
-- **Other**: We get additional English frequencies from Google Books Syntactic
-  Ngrams 2013, and Chinese frequencies from the frequency dictionary that
-  comes with the Jieba tokenizer.
-
-The following 27 languages are supported, with reasonable tokenization and at
+This data comes from a Luminoso project called [Exquisite Corpus][xc], whose
+goal is to download good, varied, multilingual corpus data, process it
+appropriately, and combine it into unified resources such as wordfreq.
+
+[xc]: https://github.com/LuminosoInsight/exquisite-corpus
+
+Exquisite Corpus compiles 8 different domains of text, some of which themselves
+come from multiple sources:
+
+- **Wikipedia**, representing encyclopedic text
+- **Subtitles**, from OPUS OpenSubtitles 2016 and SUBTLEX
+- **News**, from NewsCrawl 2014 and GlobalVoices
+- **Books**, from Google Books Ngrams 2012
+- **Web** text, from the Leeds Internet Corpus and the MOKK Hungarian Webcorpus
+- **Twitter**, representing short-form social media
+- **Reddit**, representing potentially longer Internet comments
+- **Miscellaneous** word frequencies: in Chinese, we import a free wordlist
+  that comes with the Jieba word segmenter, whose provenance we don't really know
+
+The following languages are supported, with reasonable tokenization and at
 least 3 different sources of word frequencies:

-    Language    Code  Sources Large?   │ SUBTLEX OpenSub LeedsIC Twitter Wpedia  CCrawl  Reddit  Other
-    ───────────────────────────────────┼──────────────────────────────────────────────────────────────
-    Arabic      ar    5       Yes      │ -       Yes     Yes     Yes     Yes     Yes     -       -
-    Bulgarian   bg    3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    Catalan     ca    3       -        │ -       Yes     -       Yes     Yes     -       -       -
-    Danish      da    3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    German      de    5       Yes      │ Yes     -       Yes     Yes     Yes     Yes     -       -
-    Greek       el    4       -        │ -       Yes     Yes     -       Yes     Yes     -       -
-    English     en    7       Yes      │ Yes     Yes     Yes     Yes     Yes     -       Yes     Google Books
-    Spanish     es    6       Yes      │ -       Yes     Yes     Yes     Yes     Yes     Yes     -
-    Finnish     fi    3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    French      fr    5       Yes      │ -       Yes     Yes     Yes     Yes     Yes     -       -
-    Hebrew      he    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Hindi       hi    3       -        │ -       -       -       Yes     Yes     Yes     -       -
-    Hungarian   hu    3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    Indonesian  id    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Italian     it    5       Yes      │ -       Yes     Yes     Yes     Yes     Yes     -       -
-    Japanese    ja    4       -        │ -       -       Yes     Yes     Yes     Yes     -       -
-    Korean      ko    3       -        │ -       -       -       Yes     Yes     Yes     -       -
-    Malay       ms    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Norwegian   nb[1] 3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    Dutch       nl    5       Yes      │ Yes     Yes     -       Yes     Yes     Yes     -       -
-    Polish      pl    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Portuguese  pt    5       Yes      │ -       Yes     Yes     Yes     Yes     Yes     -       -
-    Romanian    ro    3       -        │ -       Yes     -       -       Yes     Yes     -       -
-    Russian     ru    5       Yes      │ -       Yes     Yes     Yes     Yes     Yes     -       -
-    Swedish     sv    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Turkish     tr    4       -        │ -       Yes     -       Yes     Yes     Yes     -       -
-    Chinese     zh[2] 5       -        │ Yes     -       Yes     -       Yes     Yes     -       Jieba
+    Language    Code   #  Large?  │ WP    Subs  News  Books Web   Twit. Redd. Misc.
+    ──────────────────────────────┼────────────────────────────────────────────────
+    Arabic      ar     5  Yes     │ Yes   Yes   Yes   -     Yes   Yes   -     -
+    Bosnian     bs [1] 3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Bulgarian   bg     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Catalan     ca     4  -       │ Yes   Yes   Yes   -     -     Yes   -     -
+    Czech       cs     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Danish      da     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    German      de     7  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   Yes   -
+    Greek       el     3  -       │ Yes   Yes   -     -     Yes   -     -     -
+    English     en     7  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   Yes   -
+    Spanish     es     7  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   Yes   -
+    Persian     fa     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Finnish     fi     5  Yes     │ Yes   Yes   Yes   -     -     Yes   Yes   -
+    French      fr     7  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   Yes   -
+    Hebrew      he     4  -       │ Yes   Yes   -     Yes   -     Yes   -     -
+    Hindi       hi     3  -       │ Yes   -     -     -     -     Yes   Yes   -
+    Croatian    hr [1] 3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Hungarian   hu     3  -       │ Yes   Yes   -     -     Yes   -     -     -
+    Indonesian  id     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Italian     it     7  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   Yes   -
+    Japanese    ja     5  Yes     │ Yes   Yes   -     -     Yes   Yes   Yes   -
+    Korean      ko     4  -       │ Yes   Yes   -     -     -     Yes   Yes   -
+    Malay       ms     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Norwegian   nb [2] 4  -       │ Yes   Yes   -     -     -     Yes   Yes   -
+    Dutch       nl     4  Yes     │ Yes   Yes   Yes   -     -     Yes   -     -
+    Polish      pl     5  Yes     │ Yes   Yes   Yes   -     -     Yes   Yes   -
+    Portuguese  pt     5  Yes     │ Yes   Yes   Yes   -     Yes   Yes   -     -
+    Romanian    ro     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Russian     ru     6  Yes     │ Yes   Yes   Yes   Yes   Yes   Yes   -     -
+    Serbian     sr [1] 3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Swedish     sv     4  -       │ Yes   Yes   -     -     -     Yes   Yes   -
+    Turkish     tr     3  -       │ Yes   Yes   -     -     -     Yes   -     -
+    Ukrainian   uk     4  -       │ Yes   Yes   -     -     -     Yes   Yes   -
+    Chinese     zh [3] 6  Yes     │ Yes   -     Yes   Yes   Yes   Yes   -     Jieba

-[1] The Norwegian text we have is specifically written in Norwegian Bokmål, so
-we give it the language code 'nb'. We would use 'nn' for Nynorsk, but there
-isn't enough data to include it in wordfreq.
-
-[2] This data represents text written in both Simplified and Traditional
-Chinese. (SUBTLEX is mostly Simplified, while Wikipedia is mostly Traditional.)
-The characters are mapped to one another so they can use the same word
-frequency list.
+[1] Bosnian, Croatian, and Serbian use the same underlying word list, because
+they share most of their vocabulary and grammar, they were once considered the
+same language, and language detection cannot distinguish them. This word list
+can also be accessed with the language code `sh`.
+
+[2] The Norwegian text we have is specifically written in Norwegian Bokmål, so
+we give it the language code 'nb' instead of the vaguer code 'no'. We would use
+'nn' for Nynorsk, but there isn't enough data to include it in wordfreq.
+
+[3] This data represents text written in both Simplified and Traditional
+Chinese, with primarily Mandarin Chinese vocabulary. See "Multi-script
+languages" below.

 Some languages provide 'large' wordlists, including words with a Zipf frequency
-between 1.0 and 3.0. These are available in 9 languages that are covered by
+between 1.0 and 3.0. These are available in 12 languages that are covered by
 enough data sources.

@@ -298,9 +312,9 @@ also try to deal gracefully when you query it with texts that actually break
 into multiple tokens:

     >>> zipf_frequency('New York', 'en')
-    5.07
+    5.35
     >>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
-    3.58
+    3.56

 The word frequencies are combined with the half-harmonic-mean function in order
 to provide an estimate of what their combined frequency would be. In Chinese,
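As a reference for how such a combination behaves, here is a minimal sketch, assuming the combined estimate is the reciprocal of the summed reciprocals of the token frequencies (half the harmonic mean in the two-token case); the function name is ours, not wordfreq's internal API:

    def combine_token_freqs(freqs):
        # 1 / (1/f1 + 1/f2 + ...): dominated by the rarest token, and equal to
        # half of f when two tokens both have frequency f.
        return 1.0 / sum(1.0 / f for f in freqs)

    combine_token_freqs([0.02, 0.02])    # -> 0.01
    combine_token_freqs([0.02, 1e-06])   # -> just under 1e-06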
@@ -315,7 +329,56 @@ you give it an uncommon combination of tokens, it will hugely over-estimate
 their frequency:

     >>> zipf_frequency('owl-flavored', 'en')
-    3.19
+    3.18


+## Multi-script languages
+
+Two of the languages we support, Serbian and Chinese, are written in multiple
+scripts. To avoid spurious differences in word frequencies, we automatically
+transliterate the characters in these languages when looking up their words.
+
+Serbian text written in Cyrillic letters is automatically converted to Latin
+letters, using standard Serbian transliteration, when the requested language is
+`sr` or `sh`. If you request the word list as `hr` (Croatian) or `bs`
+(Bosnian), no transliteration will occur.
+
+Chinese text is converted internally to a representation we call
+"Oversimplified Chinese", where all Traditional Chinese characters are replaced
+with their Simplified Chinese equivalent, *even if* they would not be written
+that way in context. This representation lets us use a straightforward mapping
+that matches both Traditional and Simplified words, unifying their frequencies
+when appropriate, and does not appear to create clashes between unrelated words.
+
+Enumerating the Chinese wordlist will produce some unfamiliar words, because
+people don't actually write in Oversimplified Chinese, and because in
+practice Traditional and Simplified Chinese also have different word usage.
+
+
+## Similar, overlapping, and varying languages
+
+As much as we would like to give each language its own distinct code and its
+own distinct word list with distinct source data, there aren't actually sharp
+boundaries between languages.
+
+Sometimes, it's convenient to pretend that the boundaries between
+languages coincide with national borders, following the maxim that "a language
+is a dialect with an army and a navy" (Max Weinreich). This gets complicated
+when the linguistic situation and the political situation diverge.
+Moreover, some of our data sources rely on language detection, which of course
+has no idea which country the writer of the text belongs to.
+
+So we've had to make some arbitrary decisions about how to represent the
+fuzzier language boundaries, such as those within Chinese, Malay, and
+Croatian/Bosnian/Serbian. See [Language Log][] for some firsthand reports of
+the mutual intelligibility or unintelligibility of languages.
+
+[Language Log]: http://languagelog.ldc.upenn.edu/nll/?p=12633
+
+Smoothing over our arbitrary decisions is the fact that we use the `langcodes`
+module to find the best match for a language code. If you ask for word
+frequencies in `cmn-Hans` (the fully specific language code for Mandarin in
+Simplified Chinese), you will get the `zh` wordlist, for example.
+
+
 ## License
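Both behaviors described above can be checked from the public API. A small sketch (the example words are ours, and the exact frequency values are not guaranteed; the point is that the lookups match):

    from wordfreq import zipf_frequency

    # 'cmn-Hans' (Mandarin in Simplified Chinese) should fall back to the 'zh' wordlist
    zipf_frequency('谢谢', 'cmn-Hans') == zipf_frequency('谢谢', 'zh')

    # Cyrillic and Latin spellings of a Serbian word should hit the same entry
    zipf_frequency('вода', 'sr') == zipf_frequency('voda', 'sr')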
@@ -345,6 +408,10 @@ sources:

 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)

+It contains data from OPUS OpenSubtitles 2016
+(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
+the OpenSubtitles project (http://www.opensubtitles.org/).
+
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
 (see citations below) and available at

@@ -371,7 +438,8 @@ If you use wordfreq in your research, please cite it! We publish the code
 through Zenodo so that it can be reliably cited using a DOI. The current
 citation is:

-> Robyn Speer, Joshua Chin, Andrew Lin, Lance Nathan, & Sara Jewett. (2016). wordfreq: v1.5.1 [Data set]. Zenodo. http://doi.org/10.5281/zenodo.61937
+> Robyn Speer, Joshua Chin, Andrew Lin, Lance Nathan, & Sara Jewett. (2016).
+> wordfreq: v1.5.1 [Data set]. Zenodo. http://doi.org/10.5281/zenodo.61937

 The same citation in BibTex format:

@@ -393,6 +461,12 @@ The same citation in BibTex format:

 ## Citations to work that wordfreq is built on

+- Bojar, O., Chatterjee, R., Federmann, C., Haddow, B., Huck, M., Hokamp, C.,
+  Koehn, P., Logacheva, V., Monz, C., Negri, M., Post, M., Scarton, C.,
+  Specia, L., & Turchi, M. (2015). Findings of the 2015 Workshop on Statistical
+  Machine Translation.
+  http://www.statmt.org/wmt15/results.html
+
 - Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical
   Evaluation of Current Word Frequency Norms and the Introduction of a New and
   Improved Word Frequency Measure for American English. Behavior Research

@@ -418,6 +492,11 @@ The same citation in BibTex format:
 - Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
   http://unicode.org/reports/tr29/

+- Halácsy, P., Kornai, A., Németh, L., Rung, A., Szakadát, I., & Trón, V.
+  (2004). Creating open language resources for Hungarian. In Proceedings of the
+  4th international conference on Language Resources and Evaluation (LREC2004).
+  http://mokk.bme.hu/resources/webcorpus/
+
 - Keuleers, E., Brysbaert, M. & New, B. (2010). SUBTLEX-NL: A new frequency
   measure for Dutch words based on film subtitles. Behavior Research Methods,
   42(3), 643-650.

@@ -427,6 +506,11 @@ The same citation in BibTex format:
   analyzer.
   http://mecab.sourceforge.net/

+- Lison, P. and Tiedemann, J. (2016). OpenSubtitles2016: Extracting Large
+  Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th
+  International Conference on Language Resources and Evaluation (LREC 2016).
+  http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf
+
 - van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
   SUBTLEX-UK: A new and improved word frequency database for British English.
   The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
@@ -1,41 +0,0 @@
""" This file generates a graph of the dependencies for the ninja build."""

import sys
import re


def ninja_to_dot():
    def simplified_filename(path):
        component = path.split('/')[-1]
        return re.sub(
            r'[0-9]+-of', 'NN-of',
            re.sub(r'part[0-9]+', 'partNN', component)
        )

    print("digraph G {")
    print('rankdir="LR";')
    seen_edges = set()
    for line in sys.stdin:
        line = line.rstrip()
        if line.startswith('build'):
            # the output file is the first argument; strip off the colon that
            # comes from ninja syntax
            output_text, input_text = line.split(':')
            outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
            inputs = input_text.strip().split(' ')
            infiles = [simplified_filename(part) for part in inputs[1:]]
            operation = inputs[0]
            for infile in infiles:
                if infile == '|':
                    # external dependencies start here; let's not graph those
                    break
                for outfile in outfiles:
                    edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
                    if edge not in seen_edges:
                        seen_edges.add(edge)
                        print(edge)
    print("}")


if __name__ == '__main__':
    ninja_to_dot()
setup.py | 2

@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):

 setup(
     name="wordfreq",
-    version='1.5.1',
+    version='1.6',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -22,16 +22,19 @@ def test_freq_examples():
 LAUGHTER_WORDS = {
     'en': 'lol',
     'hi': 'lol',
+    'cs': 'lol',
     'ru': 'лол',
     'zh': '笑',
     'ja': '笑',
     'ar': 'ﻪﻬﻬﻬﻫ',
+    'fa': 'خخخخ',
     'ca': 'jaja',
     'es': 'jaja',
     'fr': 'ptdr',
     'pt': 'kkkk',
     'he': 'חחח',
-    'bg': 'xaxa',
+    'bg': 'ахаха',
+    'uk': 'хаха',
 }


@@ -77,7 +80,7 @@ def test_most_common_words():
         """
         return top_n_list(lang, 1)[0]

-    eq_(get_most_common('ar'), 'من')
+    eq_(get_most_common('ar'), 'في')
     eq_(get_most_common('de'), 'die')
     eq_(get_most_common('en'), 'the')
     eq_(get_most_common('es'), 'de')

@@ -85,6 +88,7 @@ def test_most_common_words():
     eq_(get_most_common('it'), 'di')
     eq_(get_most_common('ja'), 'の')
     eq_(get_most_common('nl'), 'de')
+    eq_(get_most_common('pl'), 'w')
     eq_(get_most_common('pt'), 'de')
     eq_(get_most_common('ru'), 'в')
     eq_(get_most_common('tr'), 'bir')

@@ -141,6 +145,19 @@ def test_casefolding():
     eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])


+def test_number_smashing():
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
+        ['715', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
+        ['000', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
+        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
+    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
+    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
+    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
+    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
+
+
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert_greater(ff, 0)

@@ -159,7 +176,7 @@ def test_not_really_random():
     # This not only tests random_ascii_words, it makes sure we didn't end
     # up with 'eos' as a very common Japanese word
     eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        '1 1 1 1')
+        '00 00 00 00')


 @raises(ValueError)
@@ -25,8 +25,8 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
-            # he / is / in history / #6 / counter for people
-            '他', '是', '历史上', '第六', '位',
+            # he / is / history / in / #6 / counter for people
+            '他', '是', '历史', '上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
tests/test_french_and_related.py (new file) | 31

@@ -0,0 +1,31 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency


def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
        ["qu'", "un"])
    eq_(tokenize("langues d'oïl", 'fr'),
        ['langues', "d", 'oïl'])
    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
        ['langues', "d'", 'oïl'])
    eq_(tokenize("l'heure", 'fr'),
        ['l', 'heure'])
    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
        ["l'", 'heure'])
    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
        ["l'", 'hôpital'])
    eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
    eq_(tokenize("This isn't French", 'en'),
        ['this', "isn't", 'french'])


def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
        ['m', 'acabo', 'd', 'instal·lar'])
    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
        ["m'", 'acabo', "d'", 'instal·lar', '.'])
tests/test_serbian.py (new file) | 25

@@ -0,0 +1,25 @@
from nose.tools import eq_
from wordfreq import tokenize


def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])


def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".

    eq_(tokenize("сто из ста, пацаны!", 'sr'),
        ['sto', 'iz', 'sta', 'pacany'])

    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
@@ -216,7 +216,7 @@ def iter_wordlist(lang, wordlist='combined'):
 _wf_cache = {}

 def _word_frequency(word, lang, wordlist, minimum):
-    tokens = tokenize(word, lang)
+    tokens = tokenize(word, lang, combine_numbers=True)
     if not tokens:
         return minimum

Binary data files changed (contents not shown). New binary data files:

    BIN  wordfreq/data/combined_cs.msgpack.gz
    BIN  wordfreq/data/combined_fa.msgpack.gz
    BIN  wordfreq/data/combined_sh.msgpack.gz
    BIN  wordfreq/data/combined_uk.msgpack.gz
    BIN  wordfreq/data/large_fi.msgpack.gz
    BIN  wordfreq/data/large_ja.msgpack.gz
    BIN  wordfreq/data/large_pl.msgpack.gz
    BIN  wordfreq/data/large_zh.msgpack.gz
    BIN  wordfreq/data/twitter_bg.msgpack.gz
    BIN  wordfreq/data/twitter_cs.msgpack.gz
    BIN  wordfreq/data/twitter_da.msgpack.gz
    BIN  wordfreq/data/twitter_fa.msgpack.gz
    BIN  wordfreq/data/twitter_fi.msgpack.gz
    BIN  wordfreq/data/twitter_hu.msgpack.gz
    BIN  wordfreq/data/twitter_nb.msgpack.gz
    BIN  wordfreq/data/twitter_ro.msgpack.gz
    BIN  wordfreq/data/twitter_sh.msgpack.gz
    BIN  wordfreq/data/twitter_uk.msgpack.gz

One file diff was suppressed because it is too large.
@@ -1,6 +1,6 @@
 import regex
 import unicodedata
+from .transliterate import serbian_cyrillic_to_latin

 mecab_tokenize = None
 jieba_tokenize = None

@@ -22,7 +22,6 @@ ABJAD_LANGUAGES = {
     'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }

-
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
     return ''.join(pieces)

@@ -60,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------

+    # The start of the token must not be a letter followed by «'h». If it is,
+    # we should use Case 3 to match up to the apostrophe, then match a new token
+    # starting with «h». This rule lets us break «l'heure» into two tokens, just
+    # like we would do for «l'arc».
+
+    (?!\w'[Hh])
+
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
     # (Symbol - Other) because many of these are emoji, which can convey

@@ -71,17 +77,29 @@ TOKEN_RE = regex.compile(r"""
     # (\S) and do not cause word breaks according to the Unicode word
     # segmentation heuristic (\B), or are categorized as Marks (\p{M}).

-    (?:\B\S|\p{M})*
+    (?:\B\S|\p{M})* |
+
+    # Case 3: Fix French
+    # ------------------
+    # This allows us to match the articles in French, Catalan, and related
+    # languages, such as «l'», that we may have excluded from being part of
+    # the token in Case 2.
+
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S|\p{M})*
+    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

+DIGIT_RE = regex.compile('\d')
+MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+

 def simple_tokenize(text, include_punctuation=False):
     """

@@ -113,35 +131,17 @@ def simple_tokenize(text, include_punctuation=False):
     would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.strip("'").casefold() for token in token_expr.findall(text)]
-
-
-def turkish_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
-    in Turkish, and modifies 'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [
-        commas_to_cedillas(token.strip("'").casefold())
-        for token in token_expr.findall(text)
-    ]
-
-
-def romanian_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
-    to use commas-below instead.
-    """
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [
-        cedillas_to_commas(token.strip("'").casefold())
-        for token in token_expr.findall(text)
-    ]
+    if include_punctuation:
+        return [
+            token.casefold()
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
+    else:
+        return [
+            token.strip("'").casefold()
+            for token in TOKEN_RE.findall(text)
+        ]


 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.

@@ -213,8 +213,48 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )

+
+def preprocess_turkish(text):
+    """
+    Modifies i's so that they case-fold correctly in Turkish, and modifies
+    'comma-below' characters to use cedillas.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return commas_to_cedillas(text.casefold())
+
+
+def preprocess_romanian(text):
+    """
+    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
+    """
+    return cedillas_to_commas(text.casefold())
+
+
+def preprocess_serbian(text):
+    """
+    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
+    (which is the unambiguous direction).
+    """
+    return serbian_cyrillic_to_latin(text)
+
+
+def sub_zeroes(match):
+    """
+    Given a regex match, return what it matched with digits replaced by
+    zeroes.
+    """
+    return DIGIT_RE.sub('0', match.group(0))
+
+
+def smash_numbers(text):
+    """
+    Replace sequences of multiple digits with zeroes, so we don't need to
+    distinguish the frequencies of thousands of numbers.
+    """
+    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+
+
-def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
+             combine_numbers=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language. Strings that are looked up in wordfreq will be run through

@@ -229,6 +269,17 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     - CJK scripts: Chinese, Japanese, Korean
     - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

+    The options `include_punctuation`, `external_wordlist`, and
+    `combine_numbers` are passed on to the appropriate tokenizer:
+
+    - `include_punctuation` preserves punctuation as tokens, instead of
+      removing it.
+
+    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
+      instead of wordfreq's wordlist.
+
+    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.
+
     Alphabetic scripts
     ------------------

@@ -310,17 +361,27 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
+    # A really simple way to handle language codes with more than just the
+    # language
+    lang = lang.split('-')[0]
     if lang == 'ja' or lang == 'ko':
-        return tokenize_mecab_language(text, lang, include_punctuation)
+        result = tokenize_mecab_language(text, lang, include_punctuation)
     elif lang == 'zh':
-        return chinese_tokenize(text, include_punctuation, external_wordlist)
+        result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
-        return turkish_tokenize(text, include_punctuation)
+        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
-        return romanian_tokenize(text, include_punctuation)
+        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
+    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
+        # These are the three language codes that could include Serbian text,
+        # which could be in Cyrillic.
+        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
-        return simple_tokenize(text, include_punctuation)
+        result = simple_tokenize(text, include_punctuation)
     else:
-        return simple_tokenize(text, include_punctuation)
+        result = simple_tokenize(text, include_punctuation)
+
+    if combine_numbers:
+        result = [smash_numbers(token) for token in result]
+    return result
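To see what the number-smashing pieces added above do, here is a condensed sketch that applies the same two patterns directly (it inlines `sub_zeroes` as a lambda; everything else mirrors the hunks above):

    import regex

    DIGIT_RE = regex.compile(r'\d')
    MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')

    def smash_numbers(text):
        # Replace the digits of every multi-digit run with zeroes, leaving
        # single digits and the punctuation inside numbers untouched.
        return MULTI_DIGIT_RE.sub(lambda m: DIGIT_RE.sub('0', m.group(0)), text)

    smash_numbers('In 2016, room 7 cost 1,299.50')
    # -> 'In 0000, room 7 cost 0,000.00'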
wordfreq/transliterate.py (new file) | 70

@@ -0,0 +1,70 @@
# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
SR_CYRL_TO_LATN_DICT = {
    ord('А'): 'A', ord('а'): 'a',
    ord('Б'): 'B', ord('б'): 'b',
    ord('В'): 'V', ord('в'): 'v',
    ord('Г'): 'G', ord('г'): 'g',
    ord('Д'): 'D', ord('д'): 'd',
    ord('Ђ'): 'Đ', ord('ђ'): 'đ',
    ord('Е'): 'E', ord('е'): 'e',
    ord('Ж'): 'Ž', ord('ж'): 'ž',
    ord('З'): 'Z', ord('з'): 'z',
    ord('И'): 'I', ord('и'): 'i',
    ord('Ј'): 'J', ord('ј'): 'j',
    ord('К'): 'K', ord('к'): 'k',
    ord('Л'): 'L', ord('л'): 'l',
    ord('Љ'): 'Lj', ord('љ'): 'lj',
    ord('М'): 'M', ord('м'): 'm',
    ord('Н'): 'N', ord('н'): 'n',
    ord('Њ'): 'Nj', ord('њ'): 'nj',
    ord('О'): 'O', ord('о'): 'o',
    ord('П'): 'P', ord('п'): 'p',
    ord('Р'): 'R', ord('р'): 'r',
    ord('С'): 'S', ord('с'): 's',
    ord('Т'): 'T', ord('т'): 't',
    ord('Ћ'): 'Ć', ord('ћ'): 'ć',
    ord('У'): 'U', ord('у'): 'u',
    ord('Ф'): 'F', ord('ф'): 'f',
    ord('Х'): 'H', ord('х'): 'h',
    ord('Ц'): 'C', ord('ц'): 'c',
    ord('Ч'): 'Č', ord('ч'): 'č',
    ord('Џ'): 'Dž', ord('џ'): 'dž',
    ord('Ш'): 'Š', ord('ш'): 'š',

    # Handle Cyrillic letters from other languages. We hope these cases don't
    # come up often when we're trying to transliterate Serbian, but if these
    # letters show up in loan-words or code-switching text, we can at least
    # transliterate them approximately instead of leaving them as Cyrillic
    # letters surrounded by Latin.

    # Russian letters
    ord('Ё'): 'Jo', ord('ё'): 'Jo',
    ord('Й'): 'J', ord('й'): 'j',
    ord('Щ'): 'Šč', ord('щ'): 'šč',
    ord('Ъ'): '', ord('ъ'): '',
    ord('Ы'): 'Y', ord('ы'): 'y',
    ord('Ь'): "'", ord('ь'): "'",
    ord('Э'): 'E', ord('э'): 'e',
    ord('Ю'): 'Ju', ord('ю'): 'ju',
    ord('Я'): 'Ja', ord('я'): 'ja',

    # Belarusian letter
    ord('Ў'): 'Ŭ', ord('ў'): 'ŭ',

    # Ukrainian letters
    ord('Є'): 'Je', ord('є'): 'je',
    ord('І'): 'I', ord('і'): 'i',
    ord('Ї'): 'Ji', ord('ї'): 'ji',
    ord('Ґ'): 'G', ord('ґ'): 'g',

    # Macedonian letters
    ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
    ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
    ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
}


def serbian_cyrillic_to_latin(text):
    return text.translate(SR_CYRL_TO_LATN_DICT)
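A small usage sketch for the new module (the example word is ours; the function simply applies the table above with str.translate):

    from wordfreq.transliterate import serbian_cyrillic_to_latin

    serbian_cyrillic_to_latin('Википедија')   # -> 'Vikipedija'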
wordfreq_builder/.gitignore (deleted) | 12

@@ -1,12 +0,0 @@
*.pyc
__pycache__
.coverage
.idea
dist
*.egg-info
build
_build
build.ninja
data
.ninja_deps
.ninja_log
@@ -1,8 +0,0 @@
PYTHON = python

all: build.ninja

# build the Ninja file that will take over the build process
build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
	$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja
@ -1,194 +0,0 @@
|
|||||||
# wordfreq\_builder
|
|
||||||
|
|
||||||
This package builds the data files for [wordfreq](https://github.com/LuminosoInsight/wordfreq).
|
|
||||||
|
|
||||||
It requires a fair amount of external input data (42 GB of it, as of this
|
|
||||||
writing), which unfortunately we don't have a plan for how to distribute
|
|
||||||
outside of Luminoso yet.
|
|
||||||
|
|
||||||
The data can be publicly obtained in various ways, so here we'll at least
|
|
||||||
document where it comes from. We hope to come up with a process that's more
|
|
||||||
reproducible eventually.
|
|
||||||
|
|
||||||
The good news is that you don't need to be able to run this process to use
|
|
||||||
wordfreq. The built results are already in the `wordfreq/data` directory.
|
|
||||||
|
|
||||||
## How to build it
|
|
||||||
|
|
||||||
Set up your external hard disk, your networked file system, or whatever thing
|
|
||||||
you have that's got a couple hundred GB of space free. Let's suppose the
|
|
||||||
directory of it that you want to use is called `/ext/data`.
|
|
||||||
|
|
||||||
Get the input data. At Luminoso, this is available in the directory
|
|
||||||
`/nfs/broadway/data/wordfreq_builder`. The sections below explain where the
|
|
||||||
data comes from.
|
|
||||||
|
|
||||||
Copy the input data:
|
|
||||||
|
|
||||||
cp -rv /nfs/broadway/data/wordfreq_builder /ext/data/
|
|
||||||
|
|
||||||
Make a symbolic link so that `data/` in this directory points to
|
|
||||||
your copy of the input data:
|
|
||||||
|
|
||||||
ln -s /ext/data/wordfreq_builder data
|
|
||||||
|
|
||||||
Install the Ninja build system:
|
|
||||||
|
|
||||||
sudo apt-get install ninja-build
|
|
||||||
|
|
||||||
We need to build a Ninja build file using the Python code in
|
|
||||||
`wordfreq_builder/ninja.py`. We could do this with Ninja, but... you see the
|
|
||||||
chicken-and-egg problem, don't you. So this is the one thing the Makefile
|
|
||||||
knows how to do.
|
|
||||||
|
|
||||||
make
|
|
||||||
|
|
||||||
Start the build, and find something else to do for a few hours:
|
|
||||||
|
|
||||||
ninja -v
|
|
||||||
|
|
||||||
You can copy the results into wordfreq with this command:
|
|
||||||
|
|
||||||
cp data/dist/*.msgpack.gz ../wordfreq/data/
|
|
||||||
|
|
||||||
|
|
||||||
## The Ninja build process
|
|
||||||
|
|
||||||
Ninja is a lot like Make, except with one big {drawback|advantage}: instead of
|
|
||||||
writing bizarre expressions in an idiosyncratic language to let Make calculate
|
|
||||||
which files depend on which other files...
|
|
||||||
|
|
||||||
...you just tell Ninja which files depend on which other files.
|
|
||||||
|
|
||||||
The Ninja documentation suggests using your favorite scripting language to
|
|
||||||
create the dependency list, so that's what we've done in `ninja.py`.
|
|
||||||
|
|
||||||
Dependencies in Ninja refer to build rules. These do need to be written by hand
|
|
||||||
in Ninja's own format, but the task is simpler. In this project, the build
|
|
||||||
rules are defined in `rules.ninja`. They'll be concatenated with the
|
|
||||||
Python-generated dependency definitions to form the complete build file,
|
|
||||||
`build.ninja`, which is the default file that Ninja looks at when you run
|
|
||||||
`ninja`.
|
|
||||||
|
|
||||||
So a lot of the interesting work in this package is done in `rules.ninja`.
|
|
||||||
This file defines shorthand names for long commands. As a simple example,
|
|
||||||
the rule named `format_twitter` applies the command
|
|
||||||
|
|
||||||
python -m wordfreq_builder.cli.format_twitter $in $out
|
|
||||||
|
|
||||||
to the dependency file `$in` and the output file `$out`.
|
|
||||||
|
|
||||||
The specific rules are described by the comments in `rules.ninja`.
|
|
||||||
|
|
||||||
## Data sources
|
|
||||||
|
|
||||||
### Leeds Internet Corpus
|
|
||||||
|
|
||||||
Also known as the "Web as Corpus" project, this is a University of Leeds
|
|
||||||
project that collected wordlists in assorted languages by crawling the Web.
|
|
||||||
The results are messy, but they're something. We've been using them for quite
|
|
||||||
a while.
|
|
||||||
|
|
||||||
These files can be downloaded from the [Leeds corpus page][leeds].
|
|
||||||
|
|
||||||
The original files are in `data/source-lists/leeds`, and they're processed
|
|
||||||
by the `convert_leeds` rule in `rules.ninja`.
|
|
||||||
|
|
||||||
[leeds]: http://corpus.leeds.ac.uk/list.html
|
|
||||||
|
|
||||||
### Twitter
|
|
||||||
|
|
||||||
The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
|
|
||||||
collected by the `ftfy.streamtester` package in 2014.
|
|
||||||
|
|
||||||
We are not allowed to distribute the text of tweets. However, this process could
|
|
||||||
be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
|
|
||||||
a couple of weeks.
|
|
||||||
|
|
||||||
[ftfy]: https://github.com/LuminosoInsight/python-ftfy
|
|
||||||
|
|
||||||
### Google Books
|
|
||||||
|
|
||||||
We use English word frequencies from [Google Books Syntactic Ngrams][gbsn].
|
|
||||||
We pretty much ignore the syntactic information, and only use this version
|
|
||||||
because it's cleaner. The data comes in the form of 99 gzipped text files in
|
|
||||||
`data/raw-input/google-books`.
|
|
||||||
|
|
||||||
[gbsn]: http://commondatastorage.googleapis.com/books/syntactic-ngrams/index.html
|
|
||||||
|
|
||||||
### Wikipedia
|
|
||||||
|
|
||||||
Another source we use is the full text of Wikipedia in various languages. This
|
|
||||||
text can be difficult to extract efficiently, and for this purpose we use a
|
|
||||||
custom tool written in Nim 0.11, called [wiki2text][]. To build the Wikipedia
|
|
||||||
data, you need to separately install Nim and wiki2text.
|
|
||||||
|
|
||||||
The input data files are the XML dumps that can be found on the [Wikimedia
|
|
||||||
backup index][wikidumps]. For example, to get the latest Spanish data, go to
|
|
||||||
https://dumps.wikimedia.org/frwiki/latest and look for the filename of the form
|
|
||||||
`*.pages-articles.xml.bz2`. If this file isn't there, look for an older dump
|
|
||||||
where it is. You'll need to download such a file for each language that's
|
|
||||||
configured for Wikipedia in `wordfreq_builder/config.py`.
|
|
||||||
|
|
||||||
[wiki2text]: https://github.com/rspeer/wiki2text
|
|
||||||
[wikidumps]: https://dumps.wikimedia.org/backup-index.html
|
|
||||||
|
|
||||||
### OpenSubtitles
|
|
||||||
|
|
||||||
[Hermit Dave](https://invokeit.wordpress.com/frequency-word-lists/) made word
|
|
||||||
frequency lists out of the subtitle text on OpenSubtitles. This data was
|
|
||||||
used to make Wiktionary word frequency lists at one point, but it's been
|
|
||||||
updated significantly since the version Wiktionary got.
|
|
||||||
|
|
||||||
The wordlists are in `data/source-lists/opensubtitles`.
|
|
||||||
|
|
||||||
In order to fit into the wordfreq pipeline, we renamed lists with different variants
|
|
||||||
of the same language code, to distinguish them fully according to BCP 47. Then we
|
|
||||||
concatenated the different variants into a single list, as follows:
|
|
||||||
|
|
||||||
* `zh_tw.txt` was renamed to `zh-Hant.txt`
|
|
||||||
* `zh_cn.txt` was renamed to `zh-Hans.txt`
|
|
||||||
* `zh.txt` was renamed to `zh-Hani.txt`
|
|
||||||
* `zh-Hant.txt`, `zh-Hans.txt`, and `zh-Hani.txt` were concatenated into `zh.txt`
|
|
||||||
* `pt.txt` was renamed to `pt-PT.txt`
|
|
||||||
* `pt_br.txt` was renamed to `pt-BR.txt`
|
|
||||||
* `pt-BR.txt` and `pt-PT.txt` were concatenated into `pt.txt`
|
|
||||||
|
|
||||||
We also edited the English data to re-add "'t" to words that had obviously lost
|
|
||||||
it, such as "didn" in the place of "didn't". We applied this to words that
|
|
||||||
became much less common words in the process, which means this wordlist no
|
|
||||||
longer represents the words 'don' and 'won', as we assume most of their
|
|
||||||
frequency comes from "don't" and "won't". Words that turned into similarly
|
|
||||||
common words, however, were left alone: this list doesn't represent "can't"
|
|
||||||
because the word was left as "can".
|
|
||||||
|
|
||||||
### SUBTLEX

Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in
wordfreq and derived works without the "academic use" restriction, under the
following reasonable conditions:

- Wordfreq and code derived from it must credit the SUBTLEX authors.
  (See the citations in the top-level `README.md` file.)
- It must remain clear that SUBTLEX is freely available data.

`data/source-lists/subtlex` contains the following files:

- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
  and exported from Excel format to tab-separated UTF-8 using LibreOffice
- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
  extracted, and converted from ISO-8859-1 to UTF-8
- `subtlex.en-GB.txt`, which was downloaded as
  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
  tab-separated UTF-8 using LibreOffice
- `subtlex.nl.txt`, which was downloaded as
  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
- `subtlex.zh.txt`, which was downloaded as
  [subtlexch131210.zip][subtlex-ch] and extracted

[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
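
The conversions described above are plain re-encoding and re-exporting. For
example, the SUBTLEX-US step from ISO-8859-1 to UTF-8 is roughly the following
sketch (the input filename is illustrative; it's whatever text file
`subtlexus5.zip` contains):

    # Re-encode the extracted SUBTLEX-US file from ISO-8859-1 to UTF-8.
    with open("subtlexus.txt", encoding="iso-8859-1") as infile:
        text = infile.read()
    with open("subtlex.en-US.txt", "w", encoding="utf-8") as outfile:
        outfile.write(text)
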
@ -1,117 +0,0 @@
# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.

# Variables
JQ = lib/jq-linux64

# How to build the build.ninja file itself. (Use the Makefile to get it the
# first time.)
rule build_deps
  command = python -m wordfreq_builder.cli.build_deps $in > $out

# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix

# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = bunzip2 -c $in | wiki2text > $out

# To tokenize Japanese, we run it through Mecab and take the first column.
rule tokenize_japanese
  command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

# Process Chinese by converting all Traditional Chinese characters to
# Simplified equivalents -- not because that's a good way to get readable
# text, but because that's how we're going to look them up.
rule simplify_chinese
  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out

# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix

# Reddit comments are language-detected and tokenized in the same way.
rule tokenize_reddit
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix

# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
# with /p to output only lines where the match was successful.
#
# Grep out the term "EOS", an indication that Leeds used MeCab and didn't
# strip out the EOS lines.
rule convert_leeds
  command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out

# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
  command = tr ' ' ',' < $in > $out

# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
# run it through ftfy, convert tabs to commas and spurious CSV formatting to
# spaces, and remove lines with unfixable half-mojibake.
rule convert_subtlex
  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

# Convert the Jieba wordlist to our comma-separated format, keeping its first
# two space-separated columns (word and count) and dropping lines that contain
# commas or quotation marks.
rule convert_jieba
  command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out

# Export one of our wordlists as a Jieba-compatible dictionary.
rule counts_to_jieba
  command = python -m wordfreq_builder.cli.counts_to_jieba $in $out


# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
# lines with counts of 100 or more.
#
# (These will still be repeated as the word appears in different grammatical
# roles, information that the source data provides that we're discarding. The
# source data was already filtered to only show words in roles with at least
# two-digit counts of occurrences.)
rule convert_google_syntactic_ngrams
  command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out

# Count the occurrences of each token in a file of tokenized text.
rule count
  command = python -m wordfreq_builder.cli.count_tokens $in $out

# Like 'count', but the input is tab-separated with a language code in its
# first column; only lines tagged with $language are counted.
rule count_langtagged
  command = python -m wordfreq_builder.cli.count_tokens_langtagged $in $out -l $language

# Merge per-source frequency lists for one language into a single list,
# dropping entries below the frequency cutoff.
rule merge
  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in

# Merge raw count files into a single list of counts, applying a cutoff.
rule merge_counts
  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in

# Convert a frequency list to the compact centibel (cB) representation that
# wordfreq ships, bucketing the values into $buckets bins.
rule freqs2cB
  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets

rule cat
  command = cat $in > $out

# A pipeline that extracts text from Reddit comments:
# - Unzip the input files
# - Select the body of comments, but only those whose Reddit score is positive
#   (skipping the downvoted ones)
# - Skip deleted comments
# - Replace HTML escapes
rule extract_reddit
  command = bunzip2 -c $in | $JQ -r 'select(.score > 0) | .body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
@ -1,13 +0,0 @@
from setuptools import setup

setup(
    name="wordfreq_builder",
    version='0.2',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
    install_requires=['msgpack-python', 'pycld2', 'langcodes']
)
@ -1,51 +0,0 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
from nose.tools import eq_


def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        'this', 'is', 'a', 'test', 'she', 'said',
        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
        'it', "won't", 'fail',
    ]
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')

def test_tokenizer_2():
    text = "i use punctuation informally...see?like this."
    tokens = [
        'i', 'use', 'punctuation', 'informally', 'see',
        'like', 'this'
    ]
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')

def test_tokenizer_3():
    text = "@ExampleHandle This parser removes twitter handles!"
    tokens = ['this', 'parser', 'removes', 'twitter', 'handles']
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')

def test_tokenizer_4():
    text = "This is a really boring example tco http://t.co/n15ASlkase"
    tokens = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco']
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')


def test_language_recognizer_1():
    # French: "It's the best book I've ever read"
    text = "Il est le meilleur livre que je ai jamais lu"
    result = cld2_detect_language(text)
    eq_(result, 'fr')

def test_language_recognizer_2():
    # Portuguese: a description of the Oort cloud
    text = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
    é uma nuvem esférica de planetesimais voláteis que se acredita
    localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""
    result = cld2_detect_language(text)
    eq_(result, 'pt')
@ -1,20 +0,0 @@
from wordfreq_builder.word_counts import URL_RE
from nose.tools import eq_


def check_url(url):
    match = URL_RE.match(url)
    assert match
    eq_(match.span(), (0, len(url)))


def test_url_re():
    # URLs like this are all over the Arabic Wikipedia. Here's one with the
    # student ID blanked out.
    yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx'

    yield check_url, 'https://example.com/űnicode.html'
    yield check_url, 'http://☃.net'

    assert not URL_RE.match('ftp://127.0.0.1')
@ -1,15 +0,0 @@
from wordfreq_builder.ninja import make_ninja_deps
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of rules file')
    args = parser.parse_args()

    # Make the complete ninja file and write it to standard out
    make_ninja_deps(args.in_filename)


if __name__ == '__main__':
    main()
@ -1,15 +0,0 @@
from wordfreq_builder.word_counts import count_tokens, write_wordlist
import argparse


def handle_counts(filename_in, filename_out):
    counts = count_tokens(filename_in)
    write_wordlist(counts, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)
@ -1,21 +0,0 @@
"""
Count tokens of text in a particular language, taking input from a
tab-separated file whose first column is a language code. Lines in all
languages except the specified one will be skipped.
"""
from wordfreq_builder.word_counts import count_tokens_langtagged, write_wordlist
import argparse


def handle_counts(filename_in, filename_out, lang):
    counts = count_tokens_langtagged(filename_in, lang)
    write_wordlist(counts, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    parser.add_argument('-l', '--language', help='language tag to filter lines for')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out, args.language)
@ -1,15 +0,0 @@
from wordfreq_builder.word_counts import read_values, write_jieba
import argparse


def handle_counts(filename_in, filename_out):
    freqs, total = read_values(filename_in, cutoff=1e-6)
    write_jieba(freqs, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input wordlist')
    parser.add_argument('filename_out', help='name of output Jieba-compatible wordlist')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)