# Patch summary (text before the first "diff --git" header is skipped by
# patch tools):
#
#   * tests/test.py: adds test_twitter(), which calls
#     available_languages('twitter'), passes len(avail) and 12 to
#     assert_greater, and then, for every language returned, passes
#     word_frequency('rt', lang, 'twitter') and
#     word_frequency('rt', lang, 'combined') to assert_greater.
#   * wordfreq/__init__.py: updates the word_frequency() docstring only.
#     It drops the "(and currently only)" wording about the 'combined'
#     wordlist and adds a paragraph saying a 'twitter' wordlist is also
#     available. No executable line of word_frequency() is changed by
#     these hunks.
#
# NOTE(review): this patch was received collapsed onto a single line. The
# line breaks below were restored to agree with the hunk headers
# (-34,6 +34,15 / -331,9 +331,8 / -341,6 +340,9). Indentation was NOT
# recoverable from the collapsed text and is reconstructed: the first
# context line of the first hunk (inside test_languages) and the bullet
# list in the docstring are assumed depths -- TODO confirm against the
# target files before applying.
diff --git a/tests/test.py b/tests/test.py
index 51734eb..f02323f 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -34,6 +34,15 @@ def test_languages():
             assert_greater(word_frequency('lol', new_lang_code), 0)
 
 
+def test_twitter():
+    avail = available_languages('twitter')
+    assert_greater(len(avail), 12)
+
+    for lang in avail:
+        assert_greater(word_frequency('rt', lang, 'twitter'),
+                       word_frequency('rt', lang, 'combined'))
+
+
 def test_defaults():
     eq_(word_frequency('esquivalience', 'en'), 0)
     eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 7f441ca..8451fd4 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -331,9 +331,8 @@ def half_harmonic_mean(a, b):
 def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     Get the frequency of `word` in the language with code `lang`, from the
-    specified `wordlist`. The default (and currently only) wordlist is
-    'combined', built from whichever of these four sources have sufficient
-    data for the language:
+    specified `wordlist`. The default wordlist is 'combined', built from
+    whichever of these four sources have sufficient data for the language:
 
       - Full text of Wikipedia
       - A sample of 72 million tweets collected from Twitter in 2014,
@@ -341,6 +340,9 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
       - Frequencies extracted from OpenSubtitles
       - The Leeds Internet Corpus
 
+    Another available wordlist is 'twitter', which uses only the data from
+    Twitter.
+
     Words that we believe occur at least once per million tokens, based on
     the average of these lists, will appear in the word frequency list.
     If you look up a word that's not in the list, you'll get the `default`