reorganize wordlists into 'small', 'large', and 'best'

This commit is contained in:
Robyn Speer 2018-03-08 17:52:44 -05:00
parent fe85b4e124
commit 45064a292f
65 changed files with 46 additions and 47 deletions

View File

@ -3,7 +3,7 @@ from wordfreq import (
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises
eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
)
@ -41,9 +41,24 @@ LAUGHTER_WORDS = {
def test_languages():
# Make sure the number of available languages doesn't decrease
# Make sure we get all the languages when looking for the default
# 'best' wordlist
avail = available_languages()
assert_greater(len(avail), 26)
assert_greater(len(avail), 32)
# 'small' covers the same languages, but with some different lists
avail_small = available_languages('small')
eq_(len(avail_small), len(avail))
assert_not_equal(avail_small, avail)
# 'combined' is the old name for 'small' and must return the same languages
avail_old_name = available_languages('combined')
eq_(avail_old_name, avail_small)
# 'large' covers fewer languages
avail_large = available_languages('large')
assert_greater(len(avail_large), 12)
assert_greater(len(avail), len(avail_large))
# Look up the digit '2' in the main word list for each language
for lang in avail:
@ -55,17 +70,6 @@ def test_languages():
assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
def test_twitter():
avail = available_languages('twitter')
assert_greater(len(avail), 15)
for lang in avail:
assert_greater(word_frequency('rt', lang, 'twitter'),
word_frequency('rt', lang, 'combined'))
text = LAUGHTER_WORDS.get(lang, 'haha')
assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
def test_minimums():
eq_(word_frequency('esquivalience', 'en'), 0)
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)

View File

@ -90,11 +90,21 @@ def read_cBpack(filename):
return data[1:]
def available_languages(wordlist='combined'):
def available_languages(wordlist='best'):
"""
List the languages (as language-code strings) that the wordlist of a given
name is available in.
Given a wordlist name, return a dictionary of language codes to filenames,
representing all the languages in which that wordlist is available.
"""
if wordlist == 'best':
available = available_languages('small')
available.update(available_languages('large'))
return available
elif wordlist == 'combined':
logger.warning(
"The 'combined' wordlists have been renamed to 'small'."
)
wordlist = 'small'
available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
if not path.name.startswith('_'):
@ -106,7 +116,7 @@ def available_languages(wordlist='combined'):
@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
"""
Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_cBpack` for what this represents.)
@ -187,7 +197,7 @@ def freq_to_zipf(freq):
@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
"""
Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities.
@ -201,7 +211,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
return freqs
def iter_wordlist(lang, wordlist='combined'):
def iter_wordlist(lang, wordlist='best'):
"""
Yield the words in a wordlist in approximate descending order of
frequency.
@ -247,33 +257,18 @@ def _word_frequency(word, lang, wordlist, minimum):
return max(freq, minimum)
def word_frequency(word, lang, wordlist='combined', minimum=0.):
def word_frequency(word, lang, wordlist='best', minimum=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`. The default wordlist is 'combined', built from
whichever of these five sources have sufficient data for the language:
specified `wordlist`.
- Full text of Wikipedia
- A sample of 72 million tweets collected from Twitter in 2014,
divided roughly into languages using automatic language detection
- Frequencies extracted from OpenSubtitles
- The Leeds Internet Corpus
- Google Books Syntactic Ngrams 2013
These wordlists can be specified:
Another available wordlist is 'twitter', which uses only the data from
Twitter.
Words that we believe occur at least once per million tokens, based on
the average of these lists, will appear in the word frequency list.
The value returned will always be at least as large as `minimum`.
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
individual tokens.
It should be noted that the current tokenizer does not support
multi-word Chinese phrases.
- 'large': a wordlist built from at least 5 sources, containing word
frequencies of 10^-8 and higher
- 'small': a wordlist built from at least 3 sources, containing word
frequencies of 10^-6 and higher
- 'best': uses 'large' if available, and 'small' otherwise
"""
args = (word, lang, wordlist, minimum)
try:
@ -285,7 +280,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
return _wf_cache[args]
def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
def zipf_frequency(word, lang, wordlist='best', minimum=0.):
"""
Get the frequency of `word`, in the language with code `lang`, on the Zipf
scale.
@ -313,7 +308,7 @@ def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
def top_n_list(lang, n, wordlist='best', ascii_only=False):
"""
Return a frequency list of length `n` in descending order of frequency.
This list contains words from `wordlist`, of the given language.
@ -328,7 +323,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False):
return results
def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
ascii_only=False):
"""
Returns a string of random, space separated words.
@ -353,7 +348,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
return ' '.join([random.choice(choices) for i in range(nwords)])
def random_ascii_words(lang='en', wordlist='combined', nwords=5,
def random_ascii_words(lang='en', wordlist='best', nwords=5,
bits_per_word=12):
"""
Returns a string of random, space separated, ASCII words.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.