diff --git a/setup.py b/setup.py index f3f6a6a..5567e4e 100755 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ if sys.version_info < (3, 4): setup( name="wordfreq", - version='1.0b3', + version='1.0b4', maintainer='Luminoso Technologies, Inc.', maintainer_email='info@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', diff --git a/tests/test.py b/tests/test.py index 51734eb..f02323f 100644 --- a/tests/test.py +++ b/tests/test.py @@ -34,6 +34,15 @@ def test_languages(): assert_greater(word_frequency('lol', new_lang_code), 0) +def test_twitter(): + avail = available_languages('twitter') + assert_greater(len(avail), 12) + + for lang in avail: + assert_greater(word_frequency('rt', lang, 'twitter'), + word_frequency('rt', lang, 'combined')) + + def test_defaults(): eq_(word_frequency('esquivalience', 'en'), 0) eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 7f441ca..8451fd4 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -331,9 +331,8 @@ def half_harmonic_mean(a, b): def word_frequency(word, lang, wordlist='combined', default=0.): """ Get the frequency of `word` in the language with code `lang`, from the - specified `wordlist`. The default (and currently only) wordlist is - 'combined', built from whichever of these four sources have sufficient - data for the language: + specified `wordlist`. The default wordlist is 'combined', built from + whichever of these four sources have sufficient data for the language: - Full text of Wikipedia - A sample of 72 million tweets collected from Twitter in 2014, @@ -341,6 +340,9 @@ def word_frequency(word, lang, wordlist='combined', default=0.): - Frequencies extracted from OpenSubtitles - The Leeds Internet Corpus + Another available wordlist is 'twitter', which uses only the data from + Twitter. + Words that we believe occur at least once per million tokens, based on the average of these lists, will appear in the word frequency list. If you look up a word that's not in the list, you'll get the `default` diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz new file mode 100644 index 0000000..20939f9 Binary files /dev/null and b/wordfreq/data/twitter_ar.msgpack.gz differ diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz new file mode 100644 index 0000000..1329391 Binary files /dev/null and b/wordfreq/data/twitter_de.msgpack.gz differ diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz new file mode 100644 index 0000000..8017c56 Binary files /dev/null and b/wordfreq/data/twitter_en.msgpack.gz differ diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz new file mode 100644 index 0000000..936ec75 Binary files /dev/null and b/wordfreq/data/twitter_es.msgpack.gz differ diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz new file mode 100644 index 0000000..e41589a Binary files /dev/null and b/wordfreq/data/twitter_fr.msgpack.gz differ diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz new file mode 100644 index 0000000..b2bc598 Binary files /dev/null and b/wordfreq/data/twitter_id.msgpack.gz differ diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz new file mode 100644 index 0000000..5301ed7 Binary files /dev/null and b/wordfreq/data/twitter_it.msgpack.gz differ diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz new file mode 100644 index 0000000..74f33d5 Binary files /dev/null and b/wordfreq/data/twitter_ja.msgpack.gz differ diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz new file mode 100644 index 0000000..63735be Binary files /dev/null and b/wordfreq/data/twitter_ko.msgpack.gz differ diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz new file mode 100644 index 0000000..83d2b57 Binary files /dev/null and b/wordfreq/data/twitter_ms.msgpack.gz differ diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz new file mode 100644 index 0000000..b8d2281 Binary files /dev/null and b/wordfreq/data/twitter_nl.msgpack.gz differ diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz new file mode 100644 index 0000000..348d5a1 Binary files /dev/null and b/wordfreq/data/twitter_pt.msgpack.gz differ diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz new file mode 100644 index 0000000..9082723 Binary files /dev/null and b/wordfreq/data/twitter_ru.msgpack.gz differ