test and document new twitter wordlists

2024-12-24 01:41:39 +00:00 · 2015-07-01 17:53:38 -04:00 · 2015-07-01 17:53:38 -04:00 · 14cb408100
commit 14cb408100
parent 7e3066d3fc
2 changed files with 14 additions and 3 deletions
--- a/tests/test.py
+++ b/tests/test.py
@@ -34,6 +34,15 @@ def test_languages():
            assert_greater(word_frequency('lol', new_lang_code), 0)
 def test_twitter():
    # The 'twitter' wordlist is expected to cover more than 12 languages.
    avail = available_languages('twitter')
    assert_greater(len(avail), 12)
    # For every such language, 'rt' (presumably the retweet marker -- confirm)
    # must be strictly more frequent in the Twitter-only wordlist than in the
    # 'combined' wordlist.
    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
 def test_defaults():
    # Looking up this word is expected to fall through to `default`: 0 when no
    # default is passed, otherwise exactly the value the caller supplied.
    eq_(word_frequency('esquivalience', 'en'), 0)
    eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -331,9 +331,8 @@ def half_harmonic_mean(a, b):
 def word_frequency(word, lang, wordlist='combined', default=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
-    specified `wordlist`. The default (and currently only) wordlist is
+    specified `wordlist`. The default wordlist is 'combined', built from
-    'combined', built from whichever of these four sources have sufficient
+    whichever of these four sources have sufficient data for the language:
-    data for the language:
      - Full text of Wikipedia
      - A sample of 72 million tweets collected from Twitter in 2014,
@@ -341,6 +340,9 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
      - Frequencies extracted from OpenSubtitles
      - The Leeds Internet Corpus
    Another available wordlist is 'twitter', which uses only the data from
    Twitter.
    Words that we believe occur at least once per million tokens, based on
    the average of these lists, will appear in the word frequency list.
    If you look up a word that's not in the list, you'll get the `default`