Merge pull request #14 from LuminosoInsight/add-twitter-wordlists

Add twitter wordlists

Former-commit-id: 2acca8a27a
This commit is contained in:
Joshua Chin 2015-07-01 18:00:30 -04:00
commit 98788628e9
16 changed files with 15 additions and 4 deletions

View File

@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.0b3',
version='1.0b4',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

View File

@@ -34,6 +34,15 @@ def test_languages():
assert_greater(word_frequency('lol', new_lang_code), 0)
def test_twitter():
avail = available_languages('twitter')
assert_greater(len(avail), 12)
for lang in avail:
assert_greater(word_frequency('rt', lang, 'twitter'),
word_frequency('rt', lang, 'combined'))
def test_defaults():
eq_(word_frequency('esquivalience', 'en'), 0)
eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)

View File

@@ -331,9 +331,8 @@ def half_harmonic_mean(a, b):
def word_frequency(word, lang, wordlist='combined', default=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`. The default (and currently only) wordlist is
'combined', built from whichever of these four sources have sufficient
data for the language:
specified `wordlist`. The default wordlist is 'combined', built from
whichever of these four sources have sufficient data for the language:
- Full text of Wikipedia
- A sample of 72 million tweets collected from Twitter in 2014,
@@ -341,6 +340,9 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
- Frequencies extracted from OpenSubtitles
- The Leeds Internet Corpus
Another available wordlist is 'twitter', which uses only the data from
Twitter.
Words that we believe occur at least once per million tokens, based on
the average of these lists, will appear in the word frequency list.
If you look up a word that's not in the list, you'll get the `default`

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.