Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Round frequencies to 3 significant digits
Commit 7a32b56c1c (parent a95b360563)
Files changed: README.md (89 lines), plus the tests and `_word_frequency` shown in the hunks below.
README.md:

```diff
@@ -102,45 +102,23 @@ The parameters to `word_frequency` and `zipf_frequency` are:
 value contained in the wordlist, to avoid a discontinuity where the wordlist
 ends.
 
-Other functions:
-
-`tokenize(text, lang)` splits text in the given language into words, in the same
-way that the words in wordfreq's data were counted in the first place. See
-*Tokenization*.
-
-`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
-the list, in descending frequency order.
-
-    >>> from wordfreq import top_n_list
-    >>> top_n_list('en', 10)
-    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
-
-    >>> top_n_list('es', 10)
-    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
-
-`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
-wordlist, in descending frequency order.
-
-`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
-a wordlist as a dictionary, for cases where you'll want to look up a lot of
-words and don't need the wrapper that `word_frequency` provides.
-
-`supported_languages(wordlist='best')` returns a dictionary whose keys are
-language codes, and whose values are the data file that will be loaded to
-provide the requested wordlist in each language.
-
-`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
-returns a selection of random words, separated by spaces. `bits_per_word=n`
-will select each random word from 2^n words.
-
-If you happen to want an easy way to get [a memorable, xkcd-style
-password][xkcd936] with 60 bits of entropy, this function will almost do the
-job. In this case, you should actually run the similar function
-`random_ascii_words`, limiting the selection to words that can be typed in
-ASCII. But maybe you should just use [xkpa][].
-
-[xkcd936]: https://xkcd.com/936/
-[xkpa]: https://github.com/beala/xkcd-password
+## Frequency bins
+
+wordfreq's wordlists are designed to load quickly and take up little space in
+the repository. We accomplish this by avoiding meaningless precision and
+packing the words into frequency bins.
+
+In wordfreq, all words that have the same Zipf frequency rounded to the nearest
+hundredth have the same frequency. We don't store any more precision than that.
+So instead of having to store that the frequency of a word is
+.000011748975549395302, information that is mostly meaningless, we just store
+the 600 possible frequency bins and the words they contain.
+
+Because the Zipf scale is a logarithmic scale, this preserves the same relative
+precision no matter how far down you are in the word list. The frequency of any
+word is precise to within 1%. (This is not a claim about _accuracy_, which it's
+unclear how you'd even measure, just about _precision_.)
 
 ## Sources and supported languages
```
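A note on the precision claim in the new section: rounding a Zipf value to the nearest hundredth perturbs it by at most 0.005, and since Zipf is a base-10 log scale, that bounds the relative error in frequency. A minimal sketch of the arithmetic (plain Python, not part of this commit):

```python
# Zipf is log10 of frequency per billion words, so rounding it to the
# nearest 0.01 changes the frequency by at most a factor of 10**0.005.
max_ratio = 10 ** 0.005
print(f"{max_ratio - 1:.4f}")  # 0.0116 -> roughly 1% relative precision
```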
```diff
@@ -224,6 +202,47 @@ between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
+## Other functions
+
+`tokenize(text, lang)` splits text in the given language into words, in the same
+way that the words in wordfreq's data were counted in the first place. See
+*Tokenization*.
+
+`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
+the list, in descending frequency order.
+
+    >>> from wordfreq import top_n_list
+    >>> top_n_list('en', 10)
+    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+
+    >>> top_n_list('es', 10)
+    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
+
+`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
+wordlist, in descending frequency order.
+
+`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
+a wordlist as a dictionary, for cases where you'll want to look up a lot of
+words and don't need the wrapper that `word_frequency` provides.
+
+`supported_languages(wordlist='best')` returns a dictionary whose keys are
+language codes, and whose values are the data file that will be loaded to
+provide the requested wordlist in each language.
+
+`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
+returns a selection of random words, separated by spaces. `bits_per_word=n`
+will select each random word from 2^n words.
+
+If you happen to want an easy way to get [a memorable, xkcd-style
+password][xkcd936] with 60 bits of entropy, this function will almost do the
+job. In this case, you should actually run the similar function
+`random_ascii_words`, limiting the selection to words that can be typed in
+ASCII. But maybe you should just use [xkpa][].
+
+[xkcd936]: https://xkcd.com/936/
+[xkpa]: https://github.com/beala/xkcd-password
+
 ## Tokenization
 
 wordfreq uses the Python package `regex`, which is a more advanced
```
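On the `random_words` paragraph in the hunk above: the 60-bit figure is just `nwords * bits_per_word`. A minimal usage sketch, assuming `random_ascii_words` accepts the same keyword arguments as `random_words`, as the README implies:

```python
from wordfreq import random_ascii_words

# Each word is drawn from the 2**12 = 4096 most common ASCII-typeable
# words, so five words carry 5 * 12 = 60 bits of entropy.
print(random_ascii_words(lang='en', nwords=5, bits_per_word=12))
```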
```diff
@@ -162,7 +162,7 @@ def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
     phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    assert 1.0 / ff == pytest.approx(phrase_freq)
+    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
 
 
 def test_not_really_random():
```
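The loosened tolerance follows directly from this commit: frequencies rounded to 3 significant digits can each shift by up to about 0.5%, which swamps `pytest.approx`'s default relative tolerance of `1e-6`. A standalone illustration (not from the test suite):

```python
import pytest

# Two frequencies that agree to 3 significant digits can still differ by
# a few tenths of a percent, so the tests need rel=0.01 in place of the
# default rel=1e-6.
assert 0.00117 == pytest.approx(0.001174, rel=0.01)  # passes
# assert 0.00117 == pytest.approx(0.001174)          # fails: diff ~0.34%
```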
```diff
@@ -49,11 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
 
     assert (
         1.0 / word_frequency('おはようございます', 'ja') ==
-        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
     )
 
```
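These assertions encode wordfreq's phrase-combination rule, visible in the last assert: the reciprocals of the token frequencies add up, so a phrase always comes out rarer than its rarest token. A standalone sketch with hypothetical numbers:

```python
# wordfreq combines token frequencies like resistors in parallel:
# 1 / f_phrase = 1 / f_token1 + 1 / f_token2 + ...
f1, f2 = 1.0e-4, 2.5e-5          # hypothetical token frequencies
combined = 1.0 / (1.0 / f1 + 1.0 / f2)
print(combined)                   # 2e-05, rarer than either token
```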
```diff
@@ -10,9 +10,9 @@ def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('감사합니다', 'ko') ==
-        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
     )
 
```
```diff
@@ -249,7 +249,14 @@ def _word_frequency(word, lang, wordlist, minimum):
         # probability for each word break that was inferred.
         freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
-    return max(freq, minimum)
+    # All our frequency data is only precise to within 1% anyway, so round
+    # it to 3 significant digits
+    unrounded = max(freq, minimum)
+    if unrounded == 0.:
+        return 0.
+    else:
+        leading_zeroes = math.floor(-math.log(unrounded, 10))
+        return round(unrounded, leading_zeroes + 3)
 
 
 def word_frequency(word, lang, wordlist='best', minimum=0.):
```
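To sanity-check the new rounding branch, here is the same logic pulled out as a standalone snippet (hypothetical helper name; the body mirrors the hunk above), applied to the long frequency quoted in the README:

```python
import math

def round_to_3_significant_digits(unrounded):
    # Count the leading zeroes after the decimal point, then keep three
    # more digits past them; zero is handled separately because log(0)
    # is undefined.
    if unrounded == 0.:
        return 0.
    leading_zeroes = math.floor(-math.log(unrounded, 10))
    return round(unrounded, leading_zeroes + 3)

print(round_to_3_significant_digits(0.000011748975549395302))  # 1.17e-05
```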