reorganize wordlists into 'small', 'large', and 'best'

2024-12-23 09:21:37 +00:00 · 2018-03-08 17:52:44 -05:00 · 2018-03-08 17:52:44 -05:00 · 45064a292f
commit 45064a292f
parent fe85b4e124
65 changed files with 46 additions and 47 deletions
--- a/tests/test.py
+++ b/tests/test.py
@ -3,7 +3,7 @@ from wordfreq import (
    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
 )
 from nose.tools import (
-    eq_, assert_almost_equal, assert_greater, raises
+    eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
 )


@ -41,9 +41,24 @@ LAUGHTER_WORDS = {


 def test_languages():
-    # Make sure the number of available languages doesn't decrease
+    # Make sure we get all the languages when looking for the default
+    # 'best' wordlist
    avail = available_languages()
-    assert_greater(len(avail), 26)
+    assert_greater(len(avail), 32)
+
+    # 'small' covers the same languages, but with some different lists
+    avail_small = available_languages('small')
+    eq_(len(avail_small), len(avail))
+    assert_not_equal(avail_small, avail)
+
+    # 'combined' is the same as 'small'
+    avail_old_name = available_languages('combined')
+    eq_(avail_old_name, avail_small)
+
+    # 'large' covers fewer languages
+    avail_large = available_languages('large')
+    assert_greater(len(avail_large), 12)
+    assert_greater(len(avail), len(avail_large))

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
@ -55,17 +70,6 @@ def test_languages():
        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)


-def test_twitter():
-    avail = available_languages('twitter')
-    assert_greater(len(avail), 15)
-
-    for lang in avail:
-        assert_greater(word_frequency('rt', lang, 'twitter'),
-                       word_frequency('rt', lang, 'combined'))
-        text = LAUGHTER_WORDS.get(lang, 'haha')
-        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
-
-
 def test_minimums():
    eq_(word_frequency('esquivalience', 'en'), 0)
    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
--- a/wordfreq/init.py
+++ b/wordfreq/init.py
@ -90,11 +90,21 @@ def read_cBpack(filename):
    return data[1:]


-def available_languages(wordlist='combined'):
+def available_languages(wordlist='best'):
    """
-    List the languages (as language-code strings) that the wordlist of a given
-    name is available in.
+    Given a wordlist name, return a dictionary of language codes to filenames,
+    representing all the languages in which that wordlist is available.
    """
+    if wordlist == 'best':
+        available = available_languages('small')
+        available.update(available_languages('large'))
+        return available
+    elif wordlist == 'combined':
+        logger.warning(
+            "The 'combined' wordlists have been renamed to 'small'."
+        )
+        wordlist = 'small'
+
    available = {}
    for path in DATA_PATH.glob('*.msgpack.gz'):
        if not path.name.startswith('_'):
@ -106,7 +116,7 @@ def available_languages(wordlist='combined'):


@lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
+def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)
@ -187,7 +197,7 @@ def freq_to_zipf(freq):


@lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
+def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
    """
    Get a word frequency list as a dictionary, mapping tokens to
    frequencies as floating-point probabilities.
@ -201,7 +211,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
    return freqs


-def iter_wordlist(lang, wordlist='combined'):
+def iter_wordlist(lang, wordlist='best'):
    """
    Yield the words in a wordlist in approximate descending order of
    frequency.
@ -247,33 +257,18 @@ def _word_frequency(word, lang, wordlist, minimum):
    return max(freq, minimum)


-def word_frequency(word, lang, wordlist='combined', minimum=0.):
+def word_frequency(word, lang, wordlist='best', minimum=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
-    specified `wordlist`. The default wordlist is 'combined', built from
-    whichever of these five sources have sufficient data for the language:
+    specified `wordlist`.

-      - Full text of Wikipedia
-      - A sample of 72 million tweets collected from Twitter in 2014,
-        divided roughly into languages using automatic language detection
-      - Frequencies extracted from OpenSubtitles
-      - The Leeds Internet Corpus
-      - Google Books Syntactic Ngrams 2013
+    These wordlists can be specified:

-    Another available wordlist is 'twitter', which uses only the data from
-    Twitter.
-
-    Words that we believe occur at least once per million tokens, based on
-    the average of these lists, will appear in the word frequency list.
-
-    The value returned will always be at least as large as `minimum`.
-
-    If a word decomposes into multiple tokens, we'll return a smoothed estimate
-    of the word frequency that is no greater than the frequency of any of its
-    individual tokens.
-
-    It should be noted that the current tokenizer does not support
-    multi-word Chinese phrases.
+    - 'large': a wordlist built from at least 5 sources, containing word
+      frequencies of 10^-8 and higher
+    - 'small': a wordlist built from at least 3 sources, containing word
+      frequencies of 10^-6 and higher
+    - 'best': uses 'large' if available, and 'small' otherwise
    """
    args = (word, lang, wordlist, minimum)
    try:
@ -285,7 +280,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
        return _wf_cache[args]


-def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
+def zipf_frequency(word, lang, wordlist='best', minimum=0.):
    """
    Get the frequency of `word`, in the language with code `lang`, on the Zipf
    scale.
@ -313,7 +308,7 @@ def zipf_frequency(word, lang, wordlist='combined', minimum=0.):


@lru_cache(maxsize=100)
-def top_n_list(lang, n, wordlist='combined', ascii_only=False):
+def top_n_list(lang, n, wordlist='best', ascii_only=False):
    """
    Return a frequency list of length `n` in descending order of frequency.
    This list contains words from `wordlist`, of the given language.
@ -328,7 +323,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False):
    return results


-def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
+def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
                 ascii_only=False):
    """
    Returns a string of random, space separated words.
@ -353,7 +348,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
    return ' '.join([random.choice(choices) for i in range(nwords)])


-def random_ascii_words(lang='en', wordlist='combined', nwords=5,
+def random_ascii_words(lang='en', wordlist='best', nwords=5,
                       bits_per_word=12):
    """
    Returns a string of random, space separated, ASCII words.
--- a/wordfreq/data/combined_ar.msgpack.gz
+++ b/wordfreq/data/combined_ar.msgpack.gz
--- a/wordfreq/data/combined_bg.msgpack.gz
+++ b/wordfreq/data/combined_bg.msgpack.gz
--- a/wordfreq/data/combined_bn.msgpack.gz
+++ b/wordfreq/data/combined_bn.msgpack.gz
--- a/wordfreq/data/combined_ca.msgpack.gz
+++ b/wordfreq/data/combined_ca.msgpack.gz
--- a/wordfreq/data/combined_cs.msgpack.gz
+++ b/wordfreq/data/combined_cs.msgpack.gz
--- a/wordfreq/data/combined_da.msgpack.gz
+++ b/wordfreq/data/combined_da.msgpack.gz
--- a/wordfreq/data/combined_de.msgpack.gz
+++ b/wordfreq/data/combined_de.msgpack.gz
--- a/wordfreq/data/combined_el.msgpack.gz
+++ b/wordfreq/data/combined_el.msgpack.gz
--- a/wordfreq/data/combined_en.msgpack.gz
+++ b/wordfreq/data/combined_en.msgpack.gz
--- a/wordfreq/data/combined_es.msgpack.gz
+++ b/wordfreq/data/combined_es.msgpack.gz
--- a/wordfreq/data/combined_fa.msgpack.gz
+++ b/wordfreq/data/combined_fa.msgpack.gz
--- a/wordfreq/data/combined_fi.msgpack.gz
+++ b/wordfreq/data/combined_fi.msgpack.gz
--- a/wordfreq/data/combined_fr.msgpack.gz
+++ b/wordfreq/data/combined_fr.msgpack.gz
--- a/wordfreq/data/combined_he.msgpack.gz
+++ b/wordfreq/data/combined_he.msgpack.gz
--- a/wordfreq/data/combined_hi.msgpack.gz
+++ b/wordfreq/data/combined_hi.msgpack.gz
--- a/wordfreq/data/combined_hu.msgpack.gz
+++ b/wordfreq/data/combined_hu.msgpack.gz
--- a/wordfreq/data/combined_id.msgpack.gz
+++ b/wordfreq/data/combined_id.msgpack.gz
--- a/wordfreq/data/combined_it.msgpack.gz
+++ b/wordfreq/data/combined_it.msgpack.gz
--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz
--- a/wordfreq/data/combined_ko.msgpack.gz
+++ b/wordfreq/data/combined_ko.msgpack.gz
--- a/wordfreq/data/combined_mk.msgpack.gz
+++ b/wordfreq/data/combined_mk.msgpack.gz
--- a/wordfreq/data/combined_ms.msgpack.gz
+++ b/wordfreq/data/combined_ms.msgpack.gz
--- a/wordfreq/data/combined_nb.msgpack.gz
+++ b/wordfreq/data/combined_nb.msgpack.gz
--- a/wordfreq/data/combined_nl.msgpack.gz
+++ b/wordfreq/data/combined_nl.msgpack.gz
--- a/wordfreq/data/combined_pl.msgpack.gz
+++ b/wordfreq/data/combined_pl.msgpack.gz
--- a/wordfreq/data/combined_pt.msgpack.gz
+++ b/wordfreq/data/combined_pt.msgpack.gz
--- a/wordfreq/data/combined_ro.msgpack.gz
+++ b/wordfreq/data/combined_ro.msgpack.gz
--- a/wordfreq/data/combined_ru.msgpack.gz
+++ b/wordfreq/data/combined_ru.msgpack.gz
--- a/wordfreq/data/combined_sh.msgpack.gz
+++ b/wordfreq/data/combined_sh.msgpack.gz
--- a/wordfreq/data/combined_sv.msgpack.gz
+++ b/wordfreq/data/combined_sv.msgpack.gz
--- a/wordfreq/data/combined_tr.msgpack.gz
+++ b/wordfreq/data/combined_tr.msgpack.gz
--- a/wordfreq/data/combined_uk.msgpack.gz
+++ b/wordfreq/data/combined_uk.msgpack.gz
--- a/wordfreq/data/combined_zh.msgpack.gz
+++ b/wordfreq/data/combined_zh.msgpack.gz
--- a/wordfreq/data/twitter_ar.msgpack.gz
+++ b/wordfreq/data/twitter_ar.msgpack.gz
--- a/wordfreq/data/twitter_bg.msgpack.gz
+++ b/wordfreq/data/twitter_bg.msgpack.gz
--- a/wordfreq/data/twitter_bn.msgpack.gz
+++ b/wordfreq/data/twitter_bn.msgpack.gz
--- a/wordfreq/data/twitter_ca.msgpack.gz
+++ b/wordfreq/data/twitter_ca.msgpack.gz
--- a/wordfreq/data/twitter_cs.msgpack.gz
+++ b/wordfreq/data/twitter_cs.msgpack.gz
--- a/wordfreq/data/twitter_da.msgpack.gz
+++ b/wordfreq/data/twitter_da.msgpack.gz
--- a/wordfreq/data/twitter_de.msgpack.gz
+++ b/wordfreq/data/twitter_de.msgpack.gz
--- a/wordfreq/data/twitter_en.msgpack.gz
+++ b/wordfreq/data/twitter_en.msgpack.gz
--- a/wordfreq/data/twitter_es.msgpack.gz
+++ b/wordfreq/data/twitter_es.msgpack.gz
--- a/wordfreq/data/twitter_fa.msgpack.gz
+++ b/wordfreq/data/twitter_fa.msgpack.gz
--- a/wordfreq/data/twitter_fi.msgpack.gz
+++ b/wordfreq/data/twitter_fi.msgpack.gz
--- a/wordfreq/data/twitter_fr.msgpack.gz
+++ b/wordfreq/data/twitter_fr.msgpack.gz
--- a/wordfreq/data/twitter_he.msgpack.gz
+++ b/wordfreq/data/twitter_he.msgpack.gz
--- a/wordfreq/data/twitter_hi.msgpack.gz
+++ b/wordfreq/data/twitter_hi.msgpack.gz
--- a/wordfreq/data/twitter_hu.msgpack.gz
+++ b/wordfreq/data/twitter_hu.msgpack.gz
--- a/wordfreq/data/twitter_id.msgpack.gz
+++ b/wordfreq/data/twitter_id.msgpack.gz
--- a/wordfreq/data/twitter_it.msgpack.gz
+++ b/wordfreq/data/twitter_it.msgpack.gz
--- a/wordfreq/data/twitter_ja.msgpack.gz
+++ b/wordfreq/data/twitter_ja.msgpack.gz
--- a/wordfreq/data/twitter_ko.msgpack.gz
+++ b/wordfreq/data/twitter_ko.msgpack.gz
--- a/wordfreq/data/twitter_ms.msgpack.gz
+++ b/wordfreq/data/twitter_ms.msgpack.gz
--- a/wordfreq/data/twitter_nb.msgpack.gz
+++ b/wordfreq/data/twitter_nb.msgpack.gz
--- a/wordfreq/data/twitter_nl.msgpack.gz
+++ b/wordfreq/data/twitter_nl.msgpack.gz
--- a/wordfreq/data/twitter_pl.msgpack.gz
+++ b/wordfreq/data/twitter_pl.msgpack.gz
--- a/wordfreq/data/twitter_pt.msgpack.gz
+++ b/wordfreq/data/twitter_pt.msgpack.gz
--- a/wordfreq/data/twitter_ro.msgpack.gz
+++ b/wordfreq/data/twitter_ro.msgpack.gz
--- a/wordfreq/data/twitter_ru.msgpack.gz
+++ b/wordfreq/data/twitter_ru.msgpack.gz
--- a/wordfreq/data/twitter_sh.msgpack.gz
+++ b/wordfreq/data/twitter_sh.msgpack.gz
--- a/wordfreq/data/twitter_sv.msgpack.gz
+++ b/wordfreq/data/twitter_sv.msgpack.gz
--- a/wordfreq/data/twitter_tr.msgpack.gz
+++ b/wordfreq/data/twitter_tr.msgpack.gz
--- a/wordfreq/data/twitter_uk.msgpack.gz
+++ b/wordfreq/data/twitter_uk.msgpack.gz