diff --git a/README.md b/README.md
index 079bb05..ba8f3aa 100644
--- a/README.md
+++ b/README.md
@@ -102,45 +102,23 @@ The parameters to `word_frequency` and `zipf_frequency` are:
   value contained in the wordlist, to avoid a discontinuity where the
   wordlist ends.
 
-Other functions:
-`tokenize(text, lang)` splits text in the given language into words, in the same
-way that the words in wordfreq's data were counted in the first place. See
-*Tokenization*.
+## Frequency bins
 
-`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
-the list, in descending frequency order.
+wordfreq's wordlists are designed to load quickly and take up little space in
+the repository. We accomplish this by avoiding meaningless precision and
+packing the words into frequency bins.
 
-    >>> from wordfreq import top_n_list
-    >>> top_n_list('en', 10)
-    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+In wordfreq, all words that have the same Zipf frequency rounded to the nearest
+hundredth have the same frequency. We don't store any more precision than that.
+So instead of having to store that the frequency of a word is
+.000011748975549395302, precision that is mostly meaningless, we just store
+the 600 possible frequency bins and the words they contain.
 
-    >>> top_n_list('es', 10)
-    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
-
-`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
-wordlist, in descending frequency order.
-
-`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
-a wordlist as a dictionary, for cases where you'll want to look up a lot of
-words and don't need the wrapper that `word_frequency` provides.
-
-`supported_languages(wordlist='best')` returns a dictionary whose keys are
-language codes, and whose values are the data file that will be loaded to
-provide the requested wordlist in each language.
-
-`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
-returns a selection of random words, separated by spaces. `bits_per_word=n`
-will select each random word from 2^n words.
-
-If you happen to want an easy way to get [a memorable, xkcd-style
-password][xkcd936] with 60 bits of entropy, this function will almost do the
-job. In this case, you should actually run the similar function
-`random_ascii_words`, limiting the selection to words that can be typed in
-ASCII. But maybe you should just use [xkpa][].
-
-[xkcd936]: https://xkcd.com/936/
-[xkpa]: https://github.com/beala/xkcd-password
+Because the Zipf scale is a logarithmic scale, this preserves the same relative
+precision no matter how far down you are in the word list. The frequency of any
+word is precise to within 1%. (This is a claim about _precision_, not
+_accuracy_; it's unclear how you'd even measure accuracy.)
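+
+For instance, the example frequency .000011748975549395302 above is just the
+value of the bin for a Zipf frequency of 4.07. You can check this with plain
+Python, since a Zipf value is the base-10 logarithm of a word's frequency per
+billion words:
+
+    >>> from math import log10
+    >>> round(log10(0.000011748975549395302) + 9, 2)
+    4.07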
 
 ## Sources and supported languages
@@ -224,6 +202,47 @@ between 1.0 and 3.0.
 These are available in 14 languages that are covered by
 enough data sources.
 
+## Other functions
+
+`tokenize(text, lang)` splits text in the given language into words, in the same
+way that the words in wordfreq's data were counted in the first place. See
+*Tokenization*.
+
+`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
+the list, in descending frequency order.
+
+    >>> from wordfreq import top_n_list
+    >>> top_n_list('en', 10)
+    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+
+    >>> top_n_list('es', 10)
+    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
+
+`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
+wordlist, in descending frequency order.
+
+`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
+a wordlist as a dictionary, for cases where you'll want to look up a lot of
+words and don't need the wrapper that `word_frequency` provides.
+
+`supported_languages(wordlist='best')` returns a dictionary whose keys are
+language codes, and whose values are the data file that will be loaded to
+provide the requested wordlist in each language.
+
+`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
+returns a selection of random words, separated by spaces. `bits_per_word=n`
+will select each random word from 2^n words.
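+
+For example, with the default settings, each of the 5 words is chosen from
+the 2^12 = 4096 most common words, giving 60 bits of randomness in total.
+The result below is only illustrative; since the words are random, your
+output will differ:
+
+    >>> from wordfreq import random_words
+    >>> random_words(lang='en')
+    'subject clock alive thousand direction'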
+
+If you happen to want an easy way to get [a memorable, xkcd-style
+password][xkcd936] with 60 bits of entropy, this function will almost do the
+job. In this case, you should actually run the similar function
+`random_ascii_words`, limiting the selection to words that can be typed in
+ASCII. But maybe you should just use [xkpa][].
+
+[xkcd936]: https://xkcd.com/936/
+[xkpa]: https://github.com/beala/xkcd-password
+
+
 ## Tokenization
 
 wordfreq uses the Python package `regex`, which is a more advanced
diff --git a/tests/test_general.py b/tests/test_general.py
index a89448c..74f9ef7 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -162,7 +162,7 @@ def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
     phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    assert 1.0 / ff == pytest.approx(phrase_freq)
+    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
 
 
 def test_not_really_random():
diff --git a/tests/test_japanese.py b/tests/test_japanese.py
index 5e977cf..a032a5d 100644
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -49,11 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('おはようございます', 'ja') ==
-        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
     )
diff --git a/tests/test_korean.py b/tests/test_korean.py
index 96d599a..16e2d34 100644
--- a/tests/test_korean.py
+++ b/tests/test_korean.py
@@ -10,9 +10,9 @@ def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('감사합니다', 'ko') ==
-        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
     )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 1c41998..6a993f1 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -249,7 +249,14 @@ def _word_frequency(word, lang, wordlist, minimum):
     # probability for each word break that was inferred.
     freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
-    return max(freq, minimum)
+    # All our frequency data is only precise to within 1% anyway, so round
+    # it to 3 significant digits. `leading_zeroes` counts the zeroes between
+    # the decimal point and the first significant digit: for a frequency of
+    # 0.0000117, there are 4 of them, so we round to 4 + 3 = 7 decimal places.
+    unrounded = max(freq, minimum)
+    if unrounded == 0.:
+        return 0.
+    else:
+        leading_zeroes = math.floor(-math.log(unrounded, 10))
+        return round(unrounded, leading_zeroes + 3)
 
 
 def word_frequency(word, lang, wordlist='best', minimum=0.):