Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Round frequencies to 3 significant digits
Commit 7a32b56c1c (parent a95b360563)
Files changed: README.md (89 lines), plus the tests and `_word_frequency` shown in the hunks below.
README.md:

```diff
@@ -102,45 +102,23 @@ The parameters to `word_frequency` and `zipf_frequency` are:
 value contained in the wordlist, to avoid a discontinuity where the wordlist
 ends.
 
-Other functions:
-
-`tokenize(text, lang)` splits text in the given language into words, in the same
-way that the words in wordfreq's data were counted in the first place. See
-*Tokenization*.
-
-`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
-the list, in descending frequency order.
-
-    >>> from wordfreq import top_n_list
-    >>> top_n_list('en', 10)
-    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
-
-    >>> top_n_list('es', 10)
-    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
-
-`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
-wordlist, in descending frequency order.
-
-`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
-a wordlist as a dictionary, for cases where you'll want to look up a lot of
-words and don't need the wrapper that `word_frequency` provides.
-
-`supported_languages(wordlist='best')` returns a dictionary whose keys are
-language codes, and whose values are the data file that will be loaded to
-provide the requested wordlist in each language.
-
-`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
-returns a selection of random words, separated by spaces. `bits_per_word=n`
-will select each random word from 2^n words.
-
-If you happen to want an easy way to get [a memorable, xkcd-style
-password][xkcd936] with 60 bits of entropy, this function will almost do the
-job. In this case, you should actually run the similar function
-`random_ascii_words`, limiting the selection to words that can be typed in
-ASCII. But maybe you should just use [xkpa][].
-
-[xkcd936]: https://xkcd.com/936/
-[xkpa]: https://github.com/beala/xkcd-password
+## Frequency bins
+
+wordfreq's wordlists are designed to load quickly and take up little space in
+the repository. We accomplish this by avoiding meaningless precision and
+packing the words into frequency bins.
+
+In wordfreq, all words that have the same Zipf frequency rounded to the nearest
+hundredth have the same frequency. We don't store any more precision than that.
+So instead of having to store that the frequency of a word is
+.000011748975549395302, information that is mostly meaningless, we just store
+the 600 possible frequency bins and the words they contain.
+
+Because the Zipf scale is a logarithmic scale, this preserves the same relative
+precision no matter how far down you are in the word list. The frequency of any
+word is precise to within 1%. (This is not a claim about _accuracy_, which it's
+unclear how you'd even measure, just about _precision_.)
 
 ## Sources and supported languages
```
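A note on the precision claim in the new section: rounding a Zipf value to the nearest hundredth perturbs it by at most 0.005, and since Zipf is a base-10 log scale, that bounds the relative error in frequency. A minimal sketch of the arithmetic (plain Python, not part of this commit):

```python
# Zipf is log10 of frequency per billion words, so rounding it to the
# nearest 0.01 changes the frequency by at most a factor of 10**0.005.
max_ratio = 10 ** 0.005
print(f"{max_ratio - 1:.4f}")  # 0.0116 -> roughly 1% relative precision
```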
```diff
@@ -224,6 +202,47 @@ between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
+## Other functions
+
+`tokenize(text, lang)` splits text in the given language into words, in the same
+way that the words in wordfreq's data were counted in the first place. See
+*Tokenization*.
+
+`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
+the list, in descending frequency order.
+
+    >>> from wordfreq import top_n_list
+    >>> top_n_list('en', 10)
+    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+
+    >>> top_n_list('es', 10)
+    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
+
+`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
+wordlist, in descending frequency order.
+
+`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
+a wordlist as a dictionary, for cases where you'll want to look up a lot of
+words and don't need the wrapper that `word_frequency` provides.
+
+`supported_languages(wordlist='best')` returns a dictionary whose keys are
+language codes, and whose values are the data file that will be loaded to
+provide the requested wordlist in each language.
+
+`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
+returns a selection of random words, separated by spaces. `bits_per_word=n`
+will select each random word from 2^n words.
+
+If you happen to want an easy way to get [a memorable, xkcd-style
+password][xkcd936] with 60 bits of entropy, this function will almost do the
+job. In this case, you should actually run the similar function
+`random_ascii_words`, limiting the selection to words that can be typed in
+ASCII. But maybe you should just use [xkpa][].
+
+[xkcd936]: https://xkcd.com/936/
+[xkpa]: https://github.com/beala/xkcd-password
+
 ## Tokenization
 
 wordfreq uses the Python package `regex`, which is a more advanced
```
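On the `random_words` paragraph in the hunk above: the 60-bit figure is just `nwords * bits_per_word`. A minimal usage sketch, assuming `random_ascii_words` accepts the same keyword arguments as `random_words`, as the README implies:

```python
from wordfreq import random_ascii_words

# Each word is drawn from the 2**12 = 4096 most common ASCII-typeable
# words, so five words carry 5 * 12 = 60 bits of entropy.
print(random_ascii_words(lang='en', nwords=5, bits_per_word=12))
```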
```diff
@@ -162,7 +162,7 @@ def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
     phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    assert 1.0 / ff == pytest.approx(phrase_freq)
+    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
 
 
 def test_not_really_random():
```
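The loosened tolerance follows directly from this commit: frequencies rounded to 3 significant digits can each shift by up to about 0.5%, which swamps `pytest.approx`'s default relative tolerance of `1e-6`. A standalone illustration (not from the test suite):

```python
import pytest

# Two frequencies that agree to 3 significant digits can still differ by
# a few tenths of a percent, so the tests need rel=0.01 in place of the
# default rel=1e-6.
assert 0.00117 == pytest.approx(0.001174, rel=0.01)  # passes
# assert 0.00117 == pytest.approx(0.001174)          # fails: diff ~0.34%
```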
```diff
@@ -49,11 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
 
     assert (
         1.0 / word_frequency('おはようございます', 'ja') ==
-        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
     )
 
```
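These assertions encode wordfreq's phrase-combination rule, visible in the last assert: the reciprocals of the token frequencies add up, so a phrase always comes out rarer than its rarest token. A standalone sketch with hypothetical numbers:

```python
# wordfreq combines token frequencies like resistors in parallel:
# 1 / f_phrase = 1 / f_token1 + 1 / f_token2 + ...
f1, f2 = 1.0e-4, 2.5e-5          # hypothetical token frequencies
combined = 1.0 / (1.0 / f1 + 1.0 / f2)
print(combined)                   # 2e-05, rarer than either token
```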
```diff
@@ -10,9 +10,9 @@ def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('감사합니다', 'ko') ==
-        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
     )
 
```
```diff
@@ -249,7 +249,14 @@ def _word_frequency(word, lang, wordlist, minimum):
         # probability for each word break that was inferred.
         freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
-    return max(freq, minimum)
+    # All our frequency data is only precise to within 1% anyway, so round
+    # it to 3 significant digits
+    unrounded = max(freq, minimum)
+    if unrounded == 0.:
+        return 0.
+    else:
+        leading_zeroes = math.floor(-math.log(unrounded, 10))
+        return round(unrounded, leading_zeroes + 3)
 
 
 def word_frequency(word, lang, wordlist='best', minimum=0.):
```
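To sanity-check the new rounding branch, here is the same logic pulled out as a standalone snippet (hypothetical helper name; the body mirrors the hunk above), applied to the long frequency quoted in the README:

```python
import math

def round_to_3_significant_digits(unrounded):
    # Count the leading zeroes after the decimal point, then keep three
    # more digits past them; zero is handled separately because log(0)
    # is undefined.
    if unrounded == 0.:
        return 0.
    leading_zeroes = math.floor(-math.log(unrounded, 10))
    return round(unrounded, leading_zeroes + 3)

print(round_to_3_significant_digits(0.000011748975549395302))  # 1.17e-05
```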