diff --git a/.gitignore b/.gitignore
index 975f163..a68e8ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pip-log.txt
 .coverage
 *~
 wordfreq-data.tar.gz
+.idea
+build.dot
diff --git a/README.md b/README.md
index c0eb421..0bba163 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ install them on Ubuntu:
 
 ## Usage
 
 wordfreq provides access to estimates of the frequency with which a word is
-used, in 15 languages (see *Supported languages* below). It loads
+used, in 16 languages (see *Supported languages* below). It loads
 efficiently-packed data structures that contain all words that appear at least
 once per million words.
@@ -118,34 +118,38 @@ of word usage on different topics at different levels of formality. The sources
 
 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
 - **OpenSub**: OpenSubtitles
+- **SUBTLEX**: The SUBTLEX word frequency lists
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015
 
-The following 12 languages are well-supported, using at least 3 different sources
-of word frequencies:
+The following 14 languages are well-supported, with reasonable tokenization and
+at least 3 different sources of word frequencies:
 
-    Language   Code  GBooks LeedsIC OpenSub Twitter Wikipedia
-    ──────────────────┼──────────────────────────────────────────
-    Arabic     ar    │ -      Yes     Yes     Yes     Yes
-    German     de    │ -      Yes     Yes     Yes[1]  Yes
-    English    en    │ Yes    Yes     Yes     Yes     Yes
-    Spanish    es    │ -      Yes     Yes     Yes     Yes
-    French     fr    │ -      Yes     Yes     Yes     Yes
-    Indonesian id    │ -      -       Yes     Yes     Yes
-    Italian    it    │ -      Yes     Yes     Yes     Yes
-    Japanese   ja    │ -      Yes     -       Yes     Yes
-    Malay      ms    │ -      -       Yes     Yes     Yes
-    Dutch      nl    │ -      -       Yes     Yes     Yes
-    Portuguese pt    │ -      Yes     Yes     Yes     Yes
-    Russian    ru    │ -      Yes     Yes     Yes     Yes
+    Language   Code  GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    ──────────────────┼──────────────────────────────────────────────────
+    Arabic     ar    │ -      -       Yes     Yes     Yes     Yes
+    German     de    │ -      Yes     Yes     -       Yes[1]  Yes
+    Greek      el    │ -      -       Yes     Yes     Yes     Yes
+    English    en    │ Yes    Yes     Yes     Yes     Yes     Yes
+    Spanish    es    │ -      -       Yes     Yes     Yes     Yes
+    French     fr    │ -      -       Yes     Yes     Yes     Yes
+    Indonesian id    │ -      -       -       Yes     Yes     Yes
+    Italian    it    │ -      -       Yes     Yes     Yes     Yes
+    Japanese   ja    │ -      -       Yes     -       Yes     Yes
+    Malay      ms    │ -      -       -       Yes     Yes     Yes
+    Dutch      nl    │ -      Yes     -       Yes     Yes     Yes
+    Portuguese pt    │ -      -       Yes     Yes     Yes     Yes
+    Russian    ru    │ -      -       Yes     Yes     Yes     Yes
+    Turkish    tr    │ -      -       -       Yes     Yes     Yes
 
-These 3 languages are only marginally supported so far:
+These languages are only marginally supported so far. We have too few data
+sources for Korean (feel free to suggest some), and we are lacking
+tokenization support for Chinese.
 
-    Language   Code  GBooks LeedsIC OpenSub Twitter Wikipedia
-    ──────────────────┼──────────────────────────────────────────
-    Greek      el    │ -      Yes     Yes     -       -
-    Korean     ko    │ -      -       -       Yes     Yes
-    Chinese    zh    │ -      Yes     Yes     -       -
+    Language   Code  GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    ──────────────────┼──────────────────────────────────────────────────
+    Korean     ko    │ -      -       -       -       Yes     Yes
+    Chinese    zh    │ -      Yes     Yes     Yes     -       -
 
 [1] We've counted the frequencies from tweets in German, such as they are, but
 you should be aware that German is not a frequently-used language on Twitter.
@@ -219,7 +223,58 @@ sources:
 
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
+It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
+SUBTLEX-CH, created by Marc Brysbaert et al. and available at
+http://crr.ugent.be/programs-data/subtitle-frequencies.
+
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:
+
+- Wordfreq and code derived from it must credit the SUBTLEX authors.
+- It must remain clear that SUBTLEX is freely available data.
+
+These terms are similar to the Creative Commons Attribution-ShareAlike license.
+
 Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software gives statistics about words that are commonly used on
 Twitter; it does not display or republish any Twitter content.
+
+## Citations to work that wordfreq is built on
+
+- Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical
+  Evaluation of Current Word Frequency Norms and the Introduction of a New and
+  Improved Word Frequency Measure for American English. Behavior Research
+  Methods, 41 (4), 977-990.
+  http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf
+
+- Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A. M., Bölte, J., & Böhl, A.
+  (2015). The word frequency effect. Experimental Psychology.
+  http://econtent.hogrefe.com/doi/abs/10.1027/1618-3169/a000123?journalCode=zea
+
+- Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character
+  frequencies based on film subtitles. PLoS One, 5(6), e10729.
+  http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729
+
+- Dave, H. (2011). Frequency word lists.
+  https://invokeit.wordpress.com/frequency-word-lists/
+
+- Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
+  http://unicode.org/reports/tr29/
+
+- Keuleers, E., Brysbaert, M. & New, B. (2010). SUBTLEX-NL: A new frequency
+  measure for Dutch words based on film subtitles. Behavior Research Methods,
+  42(3), 643-650.
+  http://crr.ugent.be/papers/SUBTLEX-NL_BRM.pdf
+
+- Kudo, T. (2005). Mecab: Yet another part-of-speech and morphological
+  analyzer.
+  http://mecab.sourceforge.net/
+
+- van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
+  SUBTLEX-UK: A new and improved word frequency database for British English.
+  The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
+  http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
+
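For a quick sanity check of the new language support described above, the public API can be exercised directly. This is a hedged sketch: exact frequency values depend on the packaged data, and it assumes the top-level package re-exports `tokenize`, as the tests below suggest.

```python
from wordfreq import tokenize, word_frequency

# Turkish-specific casefolding: dotted İ and dotless I survive tokenization.
print(tokenize('İstanbul', 'tr'))    # ['istanbul']

# Frequencies are estimated proportions of running text, read from the
# 'combined' wordlist by default.
print(word_frequency('bir', 'tr'))   # a float; 'bir' is among the most common Turkish words
```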
diff --git a/scripts/ninja2dot.py b/scripts/ninja2dot.py
index 42b5362..f73131c 100644
--- a/scripts/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,30 +1,39 @@
 """ This file generates a graph of the dependencies for the ninja build."""
 import sys
+import re
 
 
 def ninja_to_dot():
-    def last_component(path):
-        return path.split('/')[-1]
+    def simplified_filename(path):
+        component = path.split('/')[-1]
+        return re.sub(
+            r'[0-9]+-of', 'NN-of',
+            re.sub(r'part[0-9]+', 'partNN', component)
+        )
 
     print("digraph G {")
     print('rankdir="LR";')
+    seen_edges = set()
     for line in sys.stdin:
         line = line.rstrip()
         if line.startswith('build'):
             # the output file is the first argument; strip off the colon that
             # comes from ninja syntax
             output_text, input_text = line.split(':')
-            outfiles = [last_component(part) for part in output_text.split(' ')[1:]]
+            outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
             inputs = input_text.strip().split(' ')
-            infiles = [last_component(part) for part in inputs[1:]]
+            infiles = [simplified_filename(part) for part in inputs[1:]]
             operation = inputs[0]
             for infile in infiles:
                 if infile == '|':
                     # external dependencies start here; let's not graph those
                     break
                 for outfile in outfiles:
-                    print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation))
+                    edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
+                    if edge not in seen_edges:
+                        seen_edges.add(edge)
+                        print(edge)
 
     print("}")
diff --git a/tests/test.py b/tests/test.py
index 0a8e212..21dd9ad 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -19,7 +19,7 @@ def test_freq_examples():
 def test_languages():
     # Make sure the number of available languages doesn't decrease
     avail = available_languages()
-    assert_greater(len(avail), 14)
+    assert_greater(len(avail), 15)
 
     # Laughter is the universal language
     for lang in avail:
@@ -36,7 +36,7 @@ def test_languages():
 
 def test_twitter():
     avail = available_languages('twitter')
-    assert_greater(len(avail), 12)
+    assert_greater(len(avail), 14)
 
     for lang in avail:
         assert_greater(word_frequency('rt', lang, 'twitter'),
@@ -68,6 +68,7 @@ def test_most_common_words():
     eq_(get_most_common('nl'), 'de')
     eq_(get_most_common('pt'), 'de')
     eq_(get_most_common('ru'), 'в')
+    eq_(get_most_common('tr'), 'bir')
     eq_(get_most_common('zh'), '的')
 
 
@@ -111,6 +112,8 @@ def test_tokenization():
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
     eq_(tokenize('weiß', 'de'), ['weiss'])
+    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
+    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
 
 
 def test_phrase_freq():
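The renamed `simplified_filename` helper exists to keep the graph legible: numbered shards of the same job collapse into one node, and the new `seen_edges` set then deduplicates the resulting parallel edges. A small self-contained check (the file names are hypothetical):

```python
import re

def simplified_filename(path):
    # Collapse shard numbers like 'part03' or '00-of-99' into placeholders,
    # so parallel slices of one job share a single graph node.
    component = path.split('/')[-1]
    return re.sub(
        r'[0-9]+-of', 'NN-of',
        re.sub(r'part[0-9]+', 'partNN', component)
    )

print(simplified_filename('generated/twitter/tweets-2014.tr.part03.txt'))
print(simplified_filename('generated/twitter/tweets-2014.tr.part17.txt'))
# both print 'tweets-2014.tr.partNN.txt', so they deduplicate to one edge
```

Since the script reads the ninja file on stdin and prints DOT, the graph (and the newly git-ignored `build.dot`) can be regenerated with something like `python scripts/ninja2dot.py < build.ninja > build.dot`, then rendered with Graphviz via `dot -Tpng build.dot -o build.png`.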
diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz
index 489af0b..024d15a 100644
Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ
diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz
index 417253a..01b582a 100644
Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ
diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz
index da19b35..de5fc2a 100644
Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ
diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz
index 32e455e..fa8a955 100644
Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ
diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz
index 43a1ea4..5edb142 100644
Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ
diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz
index e337d96..3d12c37 100644
Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ
diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz
index 83ac294..611d7e9 100644
Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ
diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz
index f357cfa..9480331 100644
Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ
diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz
index e06c840..7668e78 100644
Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ
diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz
index fed4292..1424631 100644
Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ
diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz
index 264612f..f4355ea 100644
Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ
diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz
index 33dda68..3a20c21 100644
Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz
index d63551f..49548be 100644
Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ
diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz
index c4585cd..9bf91ab 100644
Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ
diff --git a/wordfreq/data/combined_tr.msgpack.gz b/wordfreq/data/combined_tr.msgpack.gz
new file mode 100644
index 0000000..e0feca0
Binary files /dev/null and b/wordfreq/data/combined_tr.msgpack.gz differ
diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz
index 0e9581b..c16cfbc 100644
Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz
index 830f7a2..7983403 100644
Binary files a/wordfreq/data/twitter_ar.msgpack.gz and b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index d6bbc83..e47744c 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_el.msgpack.gz b/wordfreq/data/twitter_el.msgpack.gz
new file mode 100644
index 0000000..bdf5d40
Binary files /dev/null and b/wordfreq/data/twitter_el.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index d305533..f9e2299 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index fb03dcc..a76fedc 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index 0540be2..fbd4a6b 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index 3295083..0f25751 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index 40b1bd8..fad7127 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index 9826353..7196ff0 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index cab27b3..cb5c2c2 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 0b422c5..e36090b 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index 015db77..7d99d85 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index bd663ae..2749a10 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 395018b..56c2fc9 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq/data/twitter_tr.msgpack.gz b/wordfreq/data/twitter_tr.msgpack.gz
new file mode 100644
index 0000000..7edc781
Binary files /dev/null and b/wordfreq/data/twitter_tr.msgpack.gz differ
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index eb2c631..e33ca1d 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -65,6 +65,15 @@ def simple_tokenize(text):
     return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
+def turkish_tokenize(text):
+    """
+    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
+    in Turkish.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+
+
 def remove_arabic_marks(text):
     """
     Remove decorations from Arabic words:
@@ -90,6 +99,8 @@ def tokenize(text, lang):
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+    - Turkish will use a different case-folding procedure, so that capital
+      I and İ map to ı and i respectively.
     - All other languages will be tokenized using a regex that mostly
       implements the Word Segmentation section of Unicode Annex #29.
       See `simple_tokenize` for details.
@@ -107,6 +118,9 @@ def tokenize(text, lang):
         from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
 
+    if lang == 'tr':
+        return turkish_tokenize(text)
+
     if lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
 
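Why `turkish_tokenize` substitutes characters before case-folding: Python's `str.casefold()` applies only the default Unicode mappings, which turn both capital I and İ into forms of dotted i and never produce dotless ı. A minimal demonstration of the behavior the new function relies on:

```python
import unicodedata

# Default casefolding is wrong for Turkish: dotless ı is never produced.
print('SIKISINCA'.casefold())    # 'sikisinca'

# Substituting the Turkish-specific mappings first preserves the distinction,
# which is what turkish_tokenize does before running TOKEN_RE.
text = unicodedata.normalize('NFC', 'SIKISINCA')
text = text.replace('İ', 'i').replace('I', 'ı')
print(text.casefold())           # 'sıkısınca'
```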
diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 2aedf27..af47613 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -161,3 +161,34 @@ longer represents the words 'don' and 'won', as we assume most of their
 frequency comes from "don't" and "won't". Words that turned into similarly
 common words, however, were left alone: this list doesn't represent "can't"
 because the word was left as "can".
+
+### SUBTLEX
+
+Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in
+wordfreq and derived works without the "academic use" restriction, under the
+following reasonable conditions:
+
+- Wordfreq and code derived from it must credit the SUBTLEX authors.
+  (See the citations in the top-level `README.md` file.)
+- It must remain clear that SUBTLEX is freely available data.
+
+`data/source-lists/subtlex` contains the following files:
+
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
+  extracted, and converted from ISO-8859-1 to UTF-8
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted
+
+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
+[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
+[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
+
diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png
index ef54b21..15635c6 100644
Binary files a/wordfreq_builder/build.png and b/wordfreq_builder/build.png differ
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index b708533..986678c 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,6 +56,12 @@ rule convert_leeds
 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out
 
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
+rule convert_subtlex
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
+
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
 # lines with counts of 100 or more.
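To make the intent of `convert_subtlex` concrete, here is a rough Python equivalent of that shell pipeline. It is a sketch only: the builder actually runs the `cut | tail | ftfy | tr | grep` command above, and the column and row parameters come from the generated ninja file.

```python
import ftfy

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    """Keep the word and frequency columns of a SUBTLEX file, as CSV."""
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:        # skip the header row(s)
                continue
            fields = line.rstrip('\n').split('\t')
            word = ftfy.fix_text(fields[textcol - 1]).strip('" ')
            freq = fields[freqcol - 1]
            if word.endswith('â'):       # unfixable half-mojibake, as in grep -v 'â,'
                continue
            outfile.write('{},{}\n'.format(word, freq))
```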
@@ -71,7 +77,10 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.combine_lists -o $out $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+
+rule merge_counts
+  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
similarity index 66%
rename from wordfreq_builder/wordfreq_builder/cli/combine_lists.py
rename to wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 61d4b1d..5e3de69 100644
--- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,12 +1,13 @@
-from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
-    freq_dicts = []
+    count_dicts = []
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=2))
-    merged = merge_freqs(freq_dicts)
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
+    merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
new file mode 100644
index 0000000..0bbe1c1
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+import argparse
+
+
+def merge_lists(input_names, output_name, cutoff):
+    freq_dicts = []
+    for input_name in input_names:
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+    merged = merge_freqs(freq_dicts)
+    write_wordlist(merged, output_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
+    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    args = parser.parse_args()
+    merge_lists(args.inputs, args.output, args.cutoff)
+
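What the updated `merge` rule does when ninja runs it, sketched with the builder's own functions (the input file names here are hypothetical):

```python
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist

# Per-source frequency files for one language (hypothetical names).
inputs = ['twitter_tr.counts.txt', 'wikipedia_tr.counts.txt',
          'opensubtitles_tr.counts.txt']

# read_freqs normalizes each list so its frequencies add up to 1; the cutoff
# stops reading once values drop below 2, relying on descending sort order.
freq_dicts = [read_freqs(name, cutoff=2) for name in inputs]
write_wordlist(merge_freqs(freq_dicts), 'combined_tr.csv')
```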
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index a80c327..dc61bc6 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -8,20 +8,25 @@ CONFIG = {
     'sources': {
         # A list of language codes (possibly un-standardized) that we'll
         # look up in filenames for these various data sources.
+        #
+        # Consider adding:
+        # 'th' when we get tokenization for it
+        # 'hi' when we stop messing up its tokenization
+        # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
-            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
-            'pt', 'ru',
-            # can be added later: 'th', 'tr'
+            'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru', 'tr'
         ],
         'wikipedia': [
-            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
-            'pt', 'ru'
-            # many more can be added
+            'ar', 'de', 'en', 'el', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru', 'tr'
         ],
         'opensubtitles': [
-            # All languages where the most common word in OpenSubtitles
-            # appears at least 5000 times
-            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
+            # This list includes languages where the most common word in
+            # OpenSubtitles appears at least 5000 times. However, we exclude
+            # German, where SUBTLEX has done better processing of the same data.
+            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'el', 'en', 'es', 'et',
             'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
             'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq',
             'sr', 'sv', 'tr', 'uk', 'zh'
@@ -33,14 +38,19 @@ CONFIG = {
             'en',
             # Using the 2012 data, we could get French, German, Italian,
             # Russian, Spanish, and (Simplified) Chinese.
-        ]
+        ],
+        'subtlex-en': ['en'],
+        'subtlex-other': ['de', 'nl', 'zh'],
     },
+    # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
         'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
+        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
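Note that `subtlex-en` and `subtlex-other` deliberately share one path template, so the merged per-language output for every SUBTLEX source lands in the same directory. A tiny illustration of how these templates expand, mirroring the `wordlist_filename` calls in `ninja.py`:

```python
template = 'generated/subtlex/subtlex_{lang}.{ext}'

print(template.format(lang='en', ext='counts.txt'))
# generated/subtlex/subtlex_en.counts.txt
print(template.format(lang='zh', ext='counts.txt'))
# generated/subtlex/subtlex_zh.counts.txt
```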
""" TMPDIR = data_filename('tmp') @@ -76,6 +77,18 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['opensubtitles'] ) ) + lines.extend( + subtlex_en_deps( + data_filename('source-lists/subtlex'), + CONFIG['sources']['subtlex-en'] + ) + ) + lines.extend( + subtlex_other_deps( + data_filename('source-lists/subtlex'), + CONFIG['sources']['subtlex-other'] + ) + ) lines.extend(combine_lists(all_languages())) print('\n'.join(lines), file=out) @@ -140,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, for language in languages ] add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, - params={'prefix': slice_file}) + params={'prefix': slice_file}, + extra='wordfreq_builder/tokenizers.py') for language in languages: combined_output = wordlist_filename('twitter', language, 'tokens.txt') @@ -188,12 +202,69 @@ def opensubtitles_deps(dirname_in, languages): prefix=dirname_in, lang=language ) reformatted_file = wordlist_filename( - 'opensubtitles', language, 'counts.txt') + 'opensubtitles', language, 'counts.txt' + ) add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines +# Which columns of the SUBTLEX data files do the word and its frequency appear +# in? +SUBTLEX_COLUMN_MAP = { + 'de': (1, 3), + 'el': (2, 3), + 'en': (1, 2), + 'nl': (1, 2), + 'zh': (1, 5) +} + + +def subtlex_en_deps(dirname_in, languages): + lines = [] + assert languages == ['en'] + regions = ['en-US', 'en-GB'] + processed_files = [] + for region in regions: + input_file = '{prefix}/subtlex.{region}.txt'.format( + prefix=dirname_in, region=region + ) + textcol, freqcol = SUBTLEX_COLUMN_MAP['en'] + processed_file = wordlist_filename('subtlex-en', region, 'processed.txt') + processed_files.append(processed_file) + add_dep( + lines, 'convert_subtlex', input_file, processed_file, + params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} + ) + + output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt') + add_dep(lines, 'merge_counts', processed_files, output_file) + + return lines + + +def subtlex_other_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/subtlex.{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + processed_file = wordlist_filename('subtlex-other', language, 'processed.txt') + output_file = wordlist_filename('subtlex-other', language, 'counts.txt') + textcol, freqcol = SUBTLEX_COLUMN_MAP[language] + + # Skip one header line by setting 'startrow' to 2 (because tail is 1-based). + # I hope we don't need to configure this by language anymore. 
+def subtlex_other_deps(dirname_in, languages):
+    lines = []
+    for language in languages:
+        input_file = '{prefix}/subtlex.{lang}.txt'.format(
+            prefix=dirname_in, lang=language
+        )
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Skip one header line by setting 'startrow' to 2 (because tail is 1-based).
+        # I hope we don't need to configure this by language anymore.
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
+        )
+        add_dep(
+            lines, 'merge_counts', processed_file, output_file
+        )
+    return lines
+
+
 def combine_lists(languages):
     lines = []
     for language in languages:
@@ -204,7 +275,8 @@ def combine_lists(languages):
         ]
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'cutoff': 2})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 92d0714..1a75626 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
         '\ufdd0-\ufdef',
         '\N{HANGUL FILLER}',
         '\N{HANGUL CHOSEONG FILLER}',
-        '\N{HANGUL JUNGSEONG FILLER}'
+        '\N{HANGUL JUNGSEONG FILLER}',
+        '<>'
     ] + [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 9da95a3..1933295 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -32,9 +32,40 @@ def count_tokens(filename):
     return counts
 
 
+def read_values(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.
+
+    Only words with a value greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by value
+    in descending order.
+
+    If lang is given, it will apply language-specific preprocessing
+    operations.
+    """
+    values = defaultdict(float)
+    total = 0.
+    with open(filename, encoding='utf-8', newline='') as infile:
+        for key, strval in csv.reader(infile):
+            val = float(strval)
+            key = fix_text(key)
+            if val < cutoff:
+                break
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
+                # Use += so that, if we give the reader concatenated files with
+                # duplicates, it does the right thing
+                values[token] += val
+                total += val
+    return values, total
+
+
 def read_freqs(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
 
     Only words with a frequency greater than or equal to `cutoff` are
     returned.
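The split between counts and frequencies is the point of this refactoring: `read_values` keeps raw values, so that, for example, the two English SUBTLEX regions can be added together with `merge_counts` before anything is normalized. A hedged sketch of that flow (file names hypothetical):

```python
from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist

# Hypothetical processed SUBTLEX files for the two English regions.
us_counts, us_total = read_values('subtlex_en-US.processed.txt', cutoff=0)
gb_counts, gb_total = read_values('subtlex_en-GB.processed.txt', cutoff=0)

# merge_counts adds raw counts term by term; normalization to frequencies
# only happens later, when read_freqs loads the merged file.
write_wordlist(merge_counts([us_counts, gb_counts]), 'subtlex_en.counts.txt')
```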
@@ -44,24 +75,11 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
-    total = 0.
-    with open(filename, encoding='utf-8', newline='') as infile:
-        for key, strval in csv.reader(infile):
-            val = float(strval)
-            if val < cutoff:
-                break
-            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
-            for token in tokens:
-                # Use += so that, if we give the reader concatenated files with
-                # duplicates, it does the right thing
-                raw_counts[fix_text(token)] += val
-                total += val
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
 
-    for word in raw_counts:
-        raw_counts[word] /= total
-
-    return raw_counts
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -96,6 +114,17 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     msgpack.dump(cBpack_data, outfile)
 
 
+def merge_counts(count_dicts):
+    """
+    Merge multiple dictionaries of counts by adding their entries.
+    """
+    merged = defaultdict(int)
+    for count_dict in count_dicts:
+        for term, count in count_dict.items():
+            merged[term] += count
+    return merged
+
+
 def merge_freqs(freq_dicts):
     """
     Merge multiple dictionaries of frequencies, representing each word with