mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
reorganize wordlists into 'small', 'large', and 'best'
This commit is contained in:
parent
fe85b4e124
commit
45064a292f
@ -3,7 +3,7 @@ from wordfreq import (
|
||||
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
|
||||
)
|
||||
from nose.tools import (
|
||||
eq_, assert_almost_equal, assert_greater, raises
|
||||
eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
|
||||
)
|
||||
|
||||
|
||||
@ -41,9 +41,24 @@ LAUGHTER_WORDS = {
|
||||
|
||||
|
||||
def test_languages():
|
||||
# Make sure the number of available languages doesn't decrease
|
||||
# Make sure we get all the languages when looking for the default
|
||||
# 'best' wordlist
|
||||
avail = available_languages()
|
||||
assert_greater(len(avail), 26)
|
||||
assert_greater(len(avail), 32)
|
||||
|
||||
# 'small' covers the same languages, but with some different lists
|
||||
avail_small = available_languages('small')
|
||||
eq_(len(avail_small), len(avail))
|
||||
assert_not_equal(avail_small, avail)
|
||||
|
||||
# 'combined' is the same as 'small'
|
||||
avail_old_name = available_languages('combined')
|
||||
eq_(avail_old_name, avail_small)
|
||||
|
||||
# 'large' covers fewer languages
|
||||
avail_large = available_languages('large')
|
||||
assert_greater(len(avail_large), 12)
|
||||
assert_greater(len(avail), len(avail_large))
|
||||
|
||||
# Look up the digit '2' in the main word list for each language
|
||||
for lang in avail:
|
||||
@ -55,17 +70,6 @@ def test_languages():
|
||||
assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
|
||||
|
||||
|
||||
def test_twitter():
|
||||
avail = available_languages('twitter')
|
||||
assert_greater(len(avail), 15)
|
||||
|
||||
for lang in avail:
|
||||
assert_greater(word_frequency('rt', lang, 'twitter'),
|
||||
word_frequency('rt', lang, 'combined'))
|
||||
text = LAUGHTER_WORDS.get(lang, 'haha')
|
||||
assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
|
||||
|
||||
|
||||
def test_minimums():
|
||||
eq_(word_frequency('esquivalience', 'en'), 0)
|
||||
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
|
||||
|
@ -90,11 +90,21 @@ def read_cBpack(filename):
|
||||
return data[1:]
|
||||
|
||||
|
||||
def available_languages(wordlist='combined'):
|
||||
def available_languages(wordlist='best'):
|
||||
"""
|
||||
List the languages (as language-code strings) that the wordlist of a given
|
||||
name is available in.
|
||||
Given a wordlist name, return a dictionary of language codes to filenames,
|
||||
representing all the languages in which that wordlist is available.
|
||||
"""
|
||||
if wordlist == 'best':
|
||||
available = available_languages('small')
|
||||
available.update(available_languages('large'))
|
||||
return available
|
||||
elif wordlist == 'combined':
|
||||
logger.warning(
|
||||
"The 'combined' wordlists have been renamed to 'small'."
|
||||
)
|
||||
wordlist = 'small'
|
||||
|
||||
available = {}
|
||||
for path in DATA_PATH.glob('*.msgpack.gz'):
|
||||
if not path.name.startswith('_'):
|
||||
@ -106,7 +116,7 @@ def available_languages(wordlist='combined'):
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
|
||||
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
|
||||
"""
|
||||
Read the raw data from a wordlist file, returning it as a list of
|
||||
lists. (See `read_cBpack` for what this represents.)
|
||||
@ -187,7 +197,7 @@ def freq_to_zipf(freq):
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
|
||||
def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
|
||||
"""
|
||||
Get a word frequency list as a dictionary, mapping tokens to
|
||||
frequencies as floating-point probabilities.
|
||||
@ -201,7 +211,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
|
||||
return freqs
|
||||
|
||||
|
||||
def iter_wordlist(lang, wordlist='combined'):
|
||||
def iter_wordlist(lang, wordlist='best'):
|
||||
"""
|
||||
Yield the words in a wordlist in approximate descending order of
|
||||
frequency.
|
||||
@ -247,33 +257,18 @@ def _word_frequency(word, lang, wordlist, minimum):
|
||||
return max(freq, minimum)
|
||||
|
||||
|
||||
def word_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
def word_frequency(word, lang, wordlist='best', minimum=0.):
|
||||
"""
|
||||
Get the frequency of `word` in the language with code `lang`, from the
|
||||
specified `wordlist`. The default wordlist is 'combined', built from
|
||||
whichever of these five sources have sufficient data for the language:
|
||||
specified `wordlist`.
|
||||
|
||||
- Full text of Wikipedia
|
||||
- A sample of 72 million tweets collected from Twitter in 2014,
|
||||
divided roughly into languages using automatic language detection
|
||||
- Frequencies extracted from OpenSubtitles
|
||||
- The Leeds Internet Corpus
|
||||
- Google Books Syntactic Ngrams 2013
|
||||
These wordlists can be specified:
|
||||
|
||||
Another available wordlist is 'twitter', which uses only the data from
|
||||
Twitter.
|
||||
|
||||
Words that we believe occur at least once per million tokens, based on
|
||||
the average of these lists, will appear in the word frequency list.
|
||||
|
||||
The value returned will always be at least as large as `minimum`.
|
||||
|
||||
If a word decomposes into multiple tokens, we'll return a smoothed estimate
|
||||
of the word frequency that is no greater than the frequency of any of its
|
||||
individual tokens.
|
||||
|
||||
It should be noted that the current tokenizer does not support
|
||||
multi-word Chinese phrases.
|
||||
- 'large': a wordlist built from at least 5 sources, containing word
|
||||
frequencies of 10^-8 and higher
|
||||
- 'small': a wordlist built from at least 3 sources, containing word
|
||||
frequencies of 10^-6 and higher
|
||||
- 'best': uses 'large' if available, and 'small' otherwise
|
||||
"""
|
||||
args = (word, lang, wordlist, minimum)
|
||||
try:
|
||||
@ -285,7 +280,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
return _wf_cache[args]
|
||||
|
||||
|
||||
def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
def zipf_frequency(word, lang, wordlist='best', minimum=0.):
|
||||
"""
|
||||
Get the frequency of `word`, in the language with code `lang`, on the Zipf
|
||||
scale.
|
||||
@ -313,7 +308,7 @@ def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
|
||||
|
||||
@lru_cache(maxsize=100)
|
||||
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
|
||||
def top_n_list(lang, n, wordlist='best', ascii_only=False):
|
||||
"""
|
||||
Return a frequency list of length `n` in descending order of frequency.
|
||||
This list contains words from `wordlist`, of the given language.
|
||||
@ -328,7 +323,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False):
|
||||
return results
|
||||
|
||||
|
||||
def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
|
||||
def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
|
||||
ascii_only=False):
|
||||
"""
|
||||
Returns a string of random, space separated words.
|
||||
@ -353,7 +348,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
|
||||
return ' '.join([random.choice(choices) for i in range(nwords)])
|
||||
|
||||
|
||||
def random_ascii_words(lang='en', wordlist='combined', nwords=5,
|
||||
def random_ascii_words(lang='en', wordlist='best', nwords=5,
|
||||
bits_per_word=12):
|
||||
"""
|
||||
Returns a string of random, space separated, ASCII words.
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user