Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Round frequencies to 3 significant digits
parent a95b360563 · commit 7a32b56c1c
README.md | 89
@@ -102,45 +102,23 @@ The parameters to `word_frequency` and `zipf_frequency` are:
 value contained in the wordlist, to avoid a discontinuity where the wordlist
 ends.
 
-Other functions:
-
-`tokenize(text, lang)` splits text in the given language into words, in the same
-way that the words in wordfreq's data were counted in the first place. See
-*Tokenization*.
-
-`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
-the list, in descending frequency order.
-
-    >>> from wordfreq import top_n_list
-    >>> top_n_list('en', 10)
-    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
-
-    >>> top_n_list('es', 10)
-    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
-
-`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
-wordlist, in descending frequency order.
-
-`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
-a wordlist as a dictionary, for cases where you'll want to look up a lot of
-words and don't need the wrapper that `word_frequency` provides.
-
-`supported_languages(wordlist='best')` returns a dictionary whose keys are
-language codes, and whose values are the data file that will be loaded to
-provide the requested wordlist in each language.
-
-`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
-returns a selection of random words, separated by spaces. `bits_per_word=n`
-will select each random word from 2^n words.
-
-If you happen to want an easy way to get [a memorable, xkcd-style
-password][xkcd936] with 60 bits of entropy, this function will almost do the
-job. In this case, you should actually run the similar function
-`random_ascii_words`, limiting the selection to words that can be typed in
-ASCII. But maybe you should just use [xkpa][].
-
-[xkcd936]: https://xkcd.com/936/
-[xkpa]: https://github.com/beala/xkcd-password
+## Frequency bins
+
+wordfreq's wordlists are designed to load quickly and take up little space in
+the repository. We accomplish this by avoiding meaningless precision and
+packing the words into frequency bins.
+
+In wordfreq, all words that have the same Zipf frequency rounded to the nearest
+hundredth have the same frequency. We don't store any more precision than that.
+So instead of having to store that the frequency of a word is
+.000011748975549395302, information that is mostly meaningless, we just store
+the 600 possible frequency bins and the words they contain.
+
+Because the Zipf scale is a logarithmic scale, this preserves the same relative
+precision no matter how far down you are in the word list. The frequency of any
+word is precise to within 1%. (This is not a claim about _accuracy_, which it's
+unclear how you'd even measure, just about _precision_.)
 
 ## Sources and supported languages
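Editorial note: the binning described in the new README section is visible from wordfreq's public API. A minimal sketch using the documented `zipf_frequency` and `word_frequency` functions (the printed values are illustrative, not exact):

    from wordfreq import word_frequency, zipf_frequency

    # Zipf frequencies come back rounded to the nearest hundredth, so every
    # word in the same bin reports exactly the same value.
    print(zipf_frequency('cat', 'en'))   # e.g. 5.06 -- two decimal places
    # The proportional frequency is 10 ** (zipf - 9), since the Zipf value is
    # the base-10 log of occurrences per billion words, so it inherits the
    # same relative precision.
    print(word_frequency('cat', 'en'))   # e.g. 1.15e-05

    # 600 bins in hundredth-sized steps cover a span of 6.00 on the Zipf scale.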
@@ -224,6 +202,47 @@ between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
+
+## Other functions
+
+`tokenize(text, lang)` splits text in the given language into words, in the same
+way that the words in wordfreq's data were counted in the first place. See
+*Tokenization*.
+
+`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
+the list, in descending frequency order.
+
+    >>> from wordfreq import top_n_list
+    >>> top_n_list('en', 10)
+    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+
+    >>> top_n_list('es', 10)
+    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
+
+`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
+wordlist, in descending frequency order.
+
+`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
+a wordlist as a dictionary, for cases where you'll want to look up a lot of
+words and don't need the wrapper that `word_frequency` provides.
+
+`supported_languages(wordlist='best')` returns a dictionary whose keys are
+language codes, and whose values are the data file that will be loaded to
+provide the requested wordlist in each language.
+
+`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
+returns a selection of random words, separated by spaces. `bits_per_word=n`
+will select each random word from 2^n words.
+
+If you happen to want an easy way to get [a memorable, xkcd-style
+password][xkcd936] with 60 bits of entropy, this function will almost do the
+job. In this case, you should actually run the similar function
+`random_ascii_words`, limiting the selection to words that can be typed in
+ASCII. But maybe you should just use [xkpa][].
+
+[xkcd936]: https://xkcd.com/936/
+[xkpa]: https://github.com/beala/xkcd-password
+
 
 ## Tokenization
 
 wordfreq uses the Python package `regex`, which is a more advanced
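Editorial note: a usage sketch of the password use case the moved section describes, using the `random_words` and `random_ascii_words` functions named there (output is random by construction, shown only for shape):

    from wordfreq import random_ascii_words, random_words

    # 5 words at 12 bits each gives 5 * 12 = 60 bits of entropy.
    print(random_words(lang='en', nwords=5, bits_per_word=12))
    # Same idea, restricted to words that can be typed in ASCII.
    print(random_ascii_words(lang='en', nwords=5, bits_per_word=12))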
@@ -162,7 +162,7 @@ def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
     phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    assert 1.0 / ff == pytest.approx(phrase_freq)
+    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
 
 
 def test_not_really_random():
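Editorial note: this test relies on wordfreq combining a phrase's token frequencies so that their reciprocals add, and the new `rel=0.01` tolerance matches the 1% precision that 3-significant-digit rounding guarantees (pytest.approx's default relative tolerance of 1e-6 is far tighter than the rounded values can satisfy). A sketch of the same arithmetic:

    from wordfreq import word_frequency

    flip = word_frequency('flip', 'en')
    flop = word_frequency('flop', 'en')

    # The reciprocal of the phrase frequency is the sum of the reciprocals of
    # the token frequencies. Each side is only as precise as its inputs, so
    # with values rounded to 3 significant digits, agreement to within 1% is
    # all the test can demand.
    combined = 1.0 / (1.0 / flip + 1.0 / flop)
    print(combined, word_frequency('flip-flop', 'en'))  # equal to within ~1%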
@@ -49,11 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
 
     assert (
         1.0 / word_frequency('おはようございます', 'ja') ==
-        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
     )
@@ -10,9 +10,9 @@ def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('감사합니다', 'ko') ==
-        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
     )
@@ -249,7 +249,14 @@ def _word_frequency(word, lang, wordlist, minimum):
         # probability for each word break that was inferred.
         freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
-    return max(freq, minimum)
+    # All our frequency data is only precise to within 1% anyway, so round
+    # it to 3 significant digits
+    unrounded = max(freq, minimum)
+    if unrounded == 0.:
+        return 0.
+    else:
+        leading_zeroes = math.floor(-math.log(unrounded, 10))
+        return round(unrounded, leading_zeroes + 3)
 
 
 def word_frequency(word, lang, wordlist='best', minimum=0.):
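Editorial note: a standalone sketch of the rounding rule this hunk adds. The helper name is illustrative, not part of wordfreq's API; in the commit the logic is inlined in `_word_frequency`.

    import math

    def round_to_3_significant_digits(freq):
        # Illustrative helper mirroring the commit's inlined logic.
        if freq == 0.:
            return 0.
        # floor(-log10(freq)) counts the zeroes between the decimal point and
        # the first significant digit: 0.0000117... has 4 of them, so rounding
        # to 4 + 3 = 7 decimal places keeps 3 significant digits.
        leading_zeroes = math.floor(-math.log(freq, 10))
        return round(freq, leading_zeroes + 3)

    # The README's example value, rounded to the information that matters:
    print(round_to_3_significant_digits(0.000011748975549395302))  # 1.17e-05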