mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Merge pull request #14 from LuminosoInsight/add-twitter-wordlists
Add twitter wordlists
Former-commit-id: 2acca8a27a
This commit is contained in:
commit
98788628e9
2
setup.py
2
setup.py
@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.0b3',
|
||||
version='1.0b4',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@@ -34,6 +34,15 @@ def test_languages():
|
||||
assert_greater(word_frequency('lol', new_lang_code), 0)
|
||||
|
||||
|
||||
def test_twitter():
|
||||
avail = available_languages('twitter')
|
||||
assert_greater(len(avail), 12)
|
||||
|
||||
for lang in avail:
|
||||
assert_greater(word_frequency('rt', lang, 'twitter'),
|
||||
word_frequency('rt', lang, 'combined'))
|
||||
|
||||
|
||||
def test_defaults():
|
||||
eq_(word_frequency('esquivalience', 'en'), 0)
|
||||
eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
|
||||
|
@@ -331,9 +331,8 @@ def half_harmonic_mean(a, b):
|
||||
def word_frequency(word, lang, wordlist='combined', default=0.):
|
||||
"""
|
||||
Get the frequency of `word` in the language with code `lang`, from the
|
||||
specified `wordlist`. The default (and currently only) wordlist is
|
||||
'combined', built from whichever of these four sources have sufficient
|
||||
data for the language:
|
||||
specified `wordlist`. The default wordlist is 'combined', built from
|
||||
whichever of these four sources have sufficient data for the language:
|
||||
|
||||
- Full text of Wikipedia
|
||||
- A sample of 72 million tweets collected from Twitter in 2014,
|
||||
@@ -341,6 +340,9 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
|
||||
- Frequencies extracted from OpenSubtitles
|
||||
- The Leeds Internet Corpus
|
||||
|
||||
Another available wordlist is 'twitter', which uses only the data from
|
||||
Twitter.
|
||||
|
||||
Words that we believe occur at least once per million tokens, based on
|
||||
the average of these lists, will appear in the word frequency list.
|
||||
If you look up a word that's not in the list, you'll get the `default`
|
||||
|
BIN
wordfreq/data/twitter_ar.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ar.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_de.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_de.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_en.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_en.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_es.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_es.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_fr.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_fr.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_id.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_id.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_it.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_it.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_ja.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ja.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_ko.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ko.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_ms.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ms.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_nl.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_nl.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_pt.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_pt.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_ru.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ru.msgpack.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user