diff --git a/tests/test.py b/tests/test.py index cabb8ac..9e1dca7 100644 --- a/tests/test.py +++ b/tests/test.py @@ -3,7 +3,7 @@ from wordfreq import ( top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize ) from nose.tools import ( - eq_, assert_almost_equal, assert_greater, raises + eq_, assert_almost_equal, assert_greater, raises, assert_not_equal ) @@ -41,9 +41,24 @@ LAUGHTER_WORDS = { def test_languages(): - # Make sure the number of available languages doesn't decrease + # Make sure we get all the languages when looking for the default + # 'best' wordlist avail = available_languages() - assert_greater(len(avail), 26) + assert_greater(len(avail), 32) + + # 'small' covers the same languages, but with some different lists + avail_small = available_languages('small') + eq_(len(avail_small), len(avail)) + assert_not_equal(avail_small, avail) + + # 'combined' is the same as 'small' + avail_old_name = available_languages('combined') + eq_(avail_old_name, avail_small) + + # 'large' covers fewer languages + avail_large = available_languages('large') + assert_greater(len(avail_large), 12) + assert_greater(len(avail), len(avail_large)) # Look up the digit '2' in the main word list for each language for lang in avail: @@ -55,17 +70,6 @@ def test_languages(): assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code) -def test_twitter(): - avail = available_languages('twitter') - assert_greater(len(avail), 15) - - for lang in avail: - assert_greater(word_frequency('rt', lang, 'twitter'), - word_frequency('rt', lang, 'combined')) - text = LAUGHTER_WORDS.get(lang, 'haha') - assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang)) - - def test_minimums(): eq_(word_frequency('esquivalience', 'en'), 0) eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 4f56003..99eba75 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -90,11 +90,21 @@ def 
read_cBpack(filename): return data[1:] -def available_languages(wordlist='combined'): +def available_languages(wordlist='best'): """ - List the languages (as language-code strings) that the wordlist of a given - name is available in. + Given a wordlist name, return a dictionary of language codes to filenames, + representing all the languages in which that wordlist is available. """ + if wordlist == 'best': + available = available_languages('small') + available.update(available_languages('large')) + return available + elif wordlist == 'combined': + logger.warning( + "The 'combined' wordlists have been renamed to 'small'." + ) + wordlist = 'small' + available = {} for path in DATA_PATH.glob('*.msgpack.gz'): if not path.name.startswith('_'): @@ -106,7 +116,7 @@ def available_languages(wordlist='combined'): @lru_cache(maxsize=None) -def get_frequency_list(lang, wordlist='combined', match_cutoff=30): +def get_frequency_list(lang, wordlist='best', match_cutoff=30): """ Read the raw data from a wordlist file, returning it as a list of lists. (See `read_cBpack` for what this represents.) @@ -187,7 +197,7 @@ def freq_to_zipf(freq): @lru_cache(maxsize=None) -def get_frequency_dict(lang, wordlist='combined', match_cutoff=30): +def get_frequency_dict(lang, wordlist='best', match_cutoff=30): """ Get a word frequency list as a dictionary, mapping tokens to frequencies as floating-point probabilities. @@ -201,7 +211,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30): return freqs -def iter_wordlist(lang, wordlist='combined'): +def iter_wordlist(lang, wordlist='best'): """ Yield the words in a wordlist in approximate descending order of frequency. 
@@ -247,33 +257,18 @@ def _word_frequency(word, lang, wordlist, minimum): return max(freq, minimum) -def word_frequency(word, lang, wordlist='combined', minimum=0.): +def word_frequency(word, lang, wordlist='best', minimum=0.): """ Get the frequency of `word` in the language with code `lang`, from the - specified `wordlist`. The default wordlist is 'combined', built from - whichever of these five sources have sufficient data for the language: + specified `wordlist`. - - Full text of Wikipedia - - A sample of 72 million tweets collected from Twitter in 2014, - divided roughly into languages using automatic language detection - - Frequencies extracted from OpenSubtitles - - The Leeds Internet Corpus - - Google Books Syntactic Ngrams 2013 + These wordlists can be specified: - Another available wordlist is 'twitter', which uses only the data from - Twitter. - - Words that we believe occur at least once per million tokens, based on - the average of these lists, will appear in the word frequency list. - - The value returned will always be at least as large as `minimum`. - - If a word decomposes into multiple tokens, we'll return a smoothed estimate - of the word frequency that is no greater than the frequency of any of its - individual tokens. - - It should be noted that the current tokenizer does not support - multi-word Chinese phrases. + - 'large': a wordlist built from at least 5 sources, containing word + frequencies of 10^-8 and higher + - 'small': a wordlist built from at least 3 sources, containing word + frequencies of 10^-6 and higher + - 'best': uses 'large' if available, and 'small' otherwise """ args = (word, lang, wordlist, minimum) try: @@ -285,7 +280,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.): return _wf_cache[args] -def zipf_frequency(word, lang, wordlist='combined', minimum=0.): +def zipf_frequency(word, lang, wordlist='best', minimum=0.): """ Get the frequency of `word`, in the language with code `lang`, on the Zipf scale. 
@@ -313,7 +308,7 @@ def zipf_frequency(word, lang, wordlist='combined', minimum=0.): @lru_cache(maxsize=100) -def top_n_list(lang, n, wordlist='combined', ascii_only=False): +def top_n_list(lang, n, wordlist='best', ascii_only=False): """ Return a frequency list of length `n` in descending order of frequency. This list contains words from `wordlist`, of the given language. @@ -328,7 +323,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False): return results -def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12, +def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12, ascii_only=False): """ Returns a string of random, space separated words. @@ -353,7 +348,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12, return ' '.join([random.choice(choices) for i in range(nwords)]) -def random_ascii_words(lang='en', wordlist='combined', nwords=5, +def random_ascii_words(lang='en', wordlist='best', nwords=5, bits_per_word=12): """ Returns a string of random, space separated, ASCII words. 
diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/small_ar.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ar.msgpack.gz rename to wordfreq/data/small_ar.msgpack.gz diff --git a/wordfreq/data/combined_bg.msgpack.gz b/wordfreq/data/small_bg.msgpack.gz similarity index 100% rename from wordfreq/data/combined_bg.msgpack.gz rename to wordfreq/data/small_bg.msgpack.gz diff --git a/wordfreq/data/combined_bn.msgpack.gz b/wordfreq/data/small_bn.msgpack.gz similarity index 100% rename from wordfreq/data/combined_bn.msgpack.gz rename to wordfreq/data/small_bn.msgpack.gz diff --git a/wordfreq/data/combined_ca.msgpack.gz b/wordfreq/data/small_ca.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ca.msgpack.gz rename to wordfreq/data/small_ca.msgpack.gz diff --git a/wordfreq/data/combined_cs.msgpack.gz b/wordfreq/data/small_cs.msgpack.gz similarity index 100% rename from wordfreq/data/combined_cs.msgpack.gz rename to wordfreq/data/small_cs.msgpack.gz diff --git a/wordfreq/data/combined_da.msgpack.gz b/wordfreq/data/small_da.msgpack.gz similarity index 100% rename from wordfreq/data/combined_da.msgpack.gz rename to wordfreq/data/small_da.msgpack.gz diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/small_de.msgpack.gz similarity index 100% rename from wordfreq/data/combined_de.msgpack.gz rename to wordfreq/data/small_de.msgpack.gz diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/small_el.msgpack.gz similarity index 100% rename from wordfreq/data/combined_el.msgpack.gz rename to wordfreq/data/small_el.msgpack.gz diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/small_en.msgpack.gz similarity index 100% rename from wordfreq/data/combined_en.msgpack.gz rename to wordfreq/data/small_en.msgpack.gz diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/small_es.msgpack.gz similarity index 100% rename from wordfreq/data/combined_es.msgpack.gz rename to 
wordfreq/data/small_es.msgpack.gz diff --git a/wordfreq/data/combined_fa.msgpack.gz b/wordfreq/data/small_fa.msgpack.gz similarity index 100% rename from wordfreq/data/combined_fa.msgpack.gz rename to wordfreq/data/small_fa.msgpack.gz diff --git a/wordfreq/data/combined_fi.msgpack.gz b/wordfreq/data/small_fi.msgpack.gz similarity index 100% rename from wordfreq/data/combined_fi.msgpack.gz rename to wordfreq/data/small_fi.msgpack.gz diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/small_fr.msgpack.gz similarity index 100% rename from wordfreq/data/combined_fr.msgpack.gz rename to wordfreq/data/small_fr.msgpack.gz diff --git a/wordfreq/data/combined_he.msgpack.gz b/wordfreq/data/small_he.msgpack.gz similarity index 100% rename from wordfreq/data/combined_he.msgpack.gz rename to wordfreq/data/small_he.msgpack.gz diff --git a/wordfreq/data/combined_hi.msgpack.gz b/wordfreq/data/small_hi.msgpack.gz similarity index 100% rename from wordfreq/data/combined_hi.msgpack.gz rename to wordfreq/data/small_hi.msgpack.gz diff --git a/wordfreq/data/combined_hu.msgpack.gz b/wordfreq/data/small_hu.msgpack.gz similarity index 100% rename from wordfreq/data/combined_hu.msgpack.gz rename to wordfreq/data/small_hu.msgpack.gz diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/small_id.msgpack.gz similarity index 100% rename from wordfreq/data/combined_id.msgpack.gz rename to wordfreq/data/small_id.msgpack.gz diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/small_it.msgpack.gz similarity index 100% rename from wordfreq/data/combined_it.msgpack.gz rename to wordfreq/data/small_it.msgpack.gz diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/small_ja.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ja.msgpack.gz rename to wordfreq/data/small_ja.msgpack.gz diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/small_ko.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ko.msgpack.gz 
rename to wordfreq/data/small_ko.msgpack.gz diff --git a/wordfreq/data/combined_mk.msgpack.gz b/wordfreq/data/small_mk.msgpack.gz similarity index 100% rename from wordfreq/data/combined_mk.msgpack.gz rename to wordfreq/data/small_mk.msgpack.gz diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/small_ms.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ms.msgpack.gz rename to wordfreq/data/small_ms.msgpack.gz diff --git a/wordfreq/data/combined_nb.msgpack.gz b/wordfreq/data/small_nb.msgpack.gz similarity index 100% rename from wordfreq/data/combined_nb.msgpack.gz rename to wordfreq/data/small_nb.msgpack.gz diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/small_nl.msgpack.gz similarity index 100% rename from wordfreq/data/combined_nl.msgpack.gz rename to wordfreq/data/small_nl.msgpack.gz diff --git a/wordfreq/data/combined_pl.msgpack.gz b/wordfreq/data/small_pl.msgpack.gz similarity index 100% rename from wordfreq/data/combined_pl.msgpack.gz rename to wordfreq/data/small_pl.msgpack.gz diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/small_pt.msgpack.gz similarity index 100% rename from wordfreq/data/combined_pt.msgpack.gz rename to wordfreq/data/small_pt.msgpack.gz diff --git a/wordfreq/data/combined_ro.msgpack.gz b/wordfreq/data/small_ro.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ro.msgpack.gz rename to wordfreq/data/small_ro.msgpack.gz diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/small_ru.msgpack.gz similarity index 100% rename from wordfreq/data/combined_ru.msgpack.gz rename to wordfreq/data/small_ru.msgpack.gz diff --git a/wordfreq/data/combined_sh.msgpack.gz b/wordfreq/data/small_sh.msgpack.gz similarity index 100% rename from wordfreq/data/combined_sh.msgpack.gz rename to wordfreq/data/small_sh.msgpack.gz diff --git a/wordfreq/data/combined_sv.msgpack.gz b/wordfreq/data/small_sv.msgpack.gz similarity index 100% rename from 
wordfreq/data/combined_sv.msgpack.gz rename to wordfreq/data/small_sv.msgpack.gz diff --git a/wordfreq/data/combined_tr.msgpack.gz b/wordfreq/data/small_tr.msgpack.gz similarity index 100% rename from wordfreq/data/combined_tr.msgpack.gz rename to wordfreq/data/small_tr.msgpack.gz diff --git a/wordfreq/data/combined_uk.msgpack.gz b/wordfreq/data/small_uk.msgpack.gz similarity index 100% rename from wordfreq/data/combined_uk.msgpack.gz rename to wordfreq/data/small_uk.msgpack.gz diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/small_zh.msgpack.gz similarity index 100% rename from wordfreq/data/combined_zh.msgpack.gz rename to wordfreq/data/small_zh.msgpack.gz diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz deleted file mode 100644 index cb5833f..0000000 Binary files a/wordfreq/data/twitter_ar.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_bg.msgpack.gz b/wordfreq/data/twitter_bg.msgpack.gz deleted file mode 100644 index dca1a53..0000000 Binary files a/wordfreq/data/twitter_bg.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_bn.msgpack.gz b/wordfreq/data/twitter_bn.msgpack.gz deleted file mode 100644 index 2d9c2e9..0000000 Binary files a/wordfreq/data/twitter_bn.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ca.msgpack.gz b/wordfreq/data/twitter_ca.msgpack.gz deleted file mode 100644 index 0ac8477..0000000 Binary files a/wordfreq/data/twitter_ca.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_cs.msgpack.gz b/wordfreq/data/twitter_cs.msgpack.gz deleted file mode 100644 index a79cb61..0000000 Binary files a/wordfreq/data/twitter_cs.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_da.msgpack.gz b/wordfreq/data/twitter_da.msgpack.gz deleted file mode 100644 index 392b1d9..0000000 Binary files a/wordfreq/data/twitter_da.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_de.msgpack.gz 
b/wordfreq/data/twitter_de.msgpack.gz deleted file mode 100644 index 04b0f55..0000000 Binary files a/wordfreq/data/twitter_de.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz deleted file mode 100644 index 604df8e..0000000 Binary files a/wordfreq/data/twitter_en.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz deleted file mode 100644 index 03ad4d9..0000000 Binary files a/wordfreq/data/twitter_es.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_fa.msgpack.gz b/wordfreq/data/twitter_fa.msgpack.gz deleted file mode 100644 index 008098a..0000000 Binary files a/wordfreq/data/twitter_fa.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_fi.msgpack.gz b/wordfreq/data/twitter_fi.msgpack.gz deleted file mode 100644 index 221d599..0000000 Binary files a/wordfreq/data/twitter_fi.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz deleted file mode 100644 index 5f59122..0000000 Binary files a/wordfreq/data/twitter_fr.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_he.msgpack.gz b/wordfreq/data/twitter_he.msgpack.gz deleted file mode 100644 index 2bb0363..0000000 Binary files a/wordfreq/data/twitter_he.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_hi.msgpack.gz b/wordfreq/data/twitter_hi.msgpack.gz deleted file mode 100644 index ee8df85..0000000 Binary files a/wordfreq/data/twitter_hi.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_hu.msgpack.gz b/wordfreq/data/twitter_hu.msgpack.gz deleted file mode 100644 index cddde5d..0000000 Binary files a/wordfreq/data/twitter_hu.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz deleted file mode 100644 index a7f020e..0000000 Binary files a/wordfreq/data/twitter_id.msgpack.gz 
and /dev/null differ diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz deleted file mode 100644 index 2c2ee97..0000000 Binary files a/wordfreq/data/twitter_it.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz deleted file mode 100644 index 2d398de..0000000 Binary files a/wordfreq/data/twitter_ja.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz deleted file mode 100644 index 07ab5ce..0000000 Binary files a/wordfreq/data/twitter_ko.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz deleted file mode 100644 index 31a12ad..0000000 Binary files a/wordfreq/data/twitter_ms.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_nb.msgpack.gz b/wordfreq/data/twitter_nb.msgpack.gz deleted file mode 100644 index 542ba22..0000000 Binary files a/wordfreq/data/twitter_nb.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz deleted file mode 100644 index d6821af..0000000 Binary files a/wordfreq/data/twitter_nl.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_pl.msgpack.gz b/wordfreq/data/twitter_pl.msgpack.gz deleted file mode 100644 index dde20bb..0000000 Binary files a/wordfreq/data/twitter_pl.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz deleted file mode 100644 index f6e3f7e..0000000 Binary files a/wordfreq/data/twitter_pt.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ro.msgpack.gz b/wordfreq/data/twitter_ro.msgpack.gz deleted file mode 100644 index d91122c..0000000 Binary files a/wordfreq/data/twitter_ro.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz deleted file mode 100644 
index b60cf65..0000000 Binary files a/wordfreq/data/twitter_ru.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_sh.msgpack.gz b/wordfreq/data/twitter_sh.msgpack.gz deleted file mode 100644 index b2d4b88..0000000 Binary files a/wordfreq/data/twitter_sh.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_sv.msgpack.gz b/wordfreq/data/twitter_sv.msgpack.gz deleted file mode 100644 index c4b1103..0000000 Binary files a/wordfreq/data/twitter_sv.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_tr.msgpack.gz b/wordfreq/data/twitter_tr.msgpack.gz deleted file mode 100644 index 824a1d4..0000000 Binary files a/wordfreq/data/twitter_tr.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/twitter_uk.msgpack.gz b/wordfreq/data/twitter_uk.msgpack.gz deleted file mode 100644 index 62b9ef8..0000000 Binary files a/wordfreq/data/twitter_uk.msgpack.gz and /dev/null differ