Round frequencies to 3 significant digits

Robyn Speer 2018-06-15 15:42:54 -04:00
parent a95b360563
commit 7a32b56c1c
5 changed files with 67 additions and 41 deletions


@@ -102,45 +102,23 @@ The parameters to `word_frequency` and `zipf_frequency` are:
 value contained in the wordlist, to avoid a discontinuity where the wordlist
 ends.
 
-Other functions:
-
-`tokenize(text, lang)` splits text in the given language into words, in the same
-way that the words in wordfreq's data were counted in the first place. See
-*Tokenization*.
-
-`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
-the list, in descending frequency order.
-
-    >>> from wordfreq import top_n_list
-    >>> top_n_list('en', 10)
-    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
-
-    >>> top_n_list('es', 10)
-    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
-
-`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
-wordlist, in descending frequency order.
-
-`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
-a wordlist as a dictionary, for cases where you'll want to look up a lot of
-words and don't need the wrapper that `word_frequency` provides.
-
-`supported_languages(wordlist='best')` returns a dictionary whose keys are
-language codes, and whose values are the data file that will be loaded to
-provide the requested wordlist in each language.
-
-`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
-returns a selection of random words, separated by spaces. `bits_per_word=n`
-will select each random word from 2^n words.
-
-If you happen to want an easy way to get [a memorable, xkcd-style
-password][xkcd936] with 60 bits of entropy, this function will almost do the
-job. In this case, you should actually run the similar function
-`random_ascii_words`, limiting the selection to words that can be typed in
-ASCII. But maybe you should just use [xkpa][].
-
-[xkcd936]: https://xkcd.com/936/
-[xkpa]: https://github.com/beala/xkcd-password
+## Frequency bins
+
+wordfreq's wordlists are designed to load quickly and take up little space in
+the repository. We accomplish this by avoiding meaningless precision and
+packing the words into frequency bins.
+
+In wordfreq, all words that have the same Zipf frequency rounded to the nearest
+hundredth have the same frequency. We don't store any more precision than that.
+So instead of having to store that the frequency of a word is
+.000011748975549395302, information that is mostly meaningless, we just store
+the 600 possible frequency bins and the words they contain.
+
+Because the Zipf scale is a logarithmic scale, this preserves the same relative
+precision no matter how far down you are in the word list. The frequency of any
+word is precise to within 1%. (This is not a claim about _accuracy_, which it's
+unclear how you'd even measure, just about _precision_.)
 
 ## Sources and supported languages
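A minimal sketch of the binning arithmetic described in the added text (the helper name here is hypothetical; this is not wordfreq's storage code): two raw frequencies that agree to the nearest hundredth of a Zipf unit land in the same bin and are treated as identical.

```python
import math

def zipf_bin(freq):
    # Zipf frequency is log10 of the word's frequency per billion words;
    # rounding to two decimal places selects one of the discrete bins.
    return round(math.log10(freq) + 9.0, 2)

# Both raw frequencies fall into the same bin (Zipf 4.07), so wordfreq
# stores no distinction between them.
print(zipf_bin(1.1748975549395302e-05))  # 4.07
print(zipf_bin(1.18e-05))                # 4.07
```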
@@ -224,6 +202,47 @@ between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
+## Other functions
+
+`tokenize(text, lang)` splits text in the given language into words, in the same
+way that the words in wordfreq's data were counted in the first place. See
+*Tokenization*.
+
+`top_n_list(lang, n, wordlist='best')` returns the most common *n* words in
+the list, in descending frequency order.
+
+    >>> from wordfreq import top_n_list
+    >>> top_n_list('en', 10)
+    ['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'that', 'for']
+
+    >>> top_n_list('es', 10)
+    ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'se']
+
+`iter_wordlist(lang, wordlist='best')` iterates through all the words in a
+wordlist, in descending frequency order.
+
+`get_frequency_dict(lang, wordlist='best')` returns all the frequencies in
+a wordlist as a dictionary, for cases where you'll want to look up a lot of
+words and don't need the wrapper that `word_frequency` provides.
+
+`supported_languages(wordlist='best')` returns a dictionary whose keys are
+language codes, and whose values are the data file that will be loaded to
+provide the requested wordlist in each language.
+
+`random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12)`
+returns a selection of random words, separated by spaces. `bits_per_word=n`
+will select each random word from 2^n words.
+
+If you happen to want an easy way to get [a memorable, xkcd-style
+password][xkcd936] with 60 bits of entropy, this function will almost do the
+job. In this case, you should actually run the similar function
+`random_ascii_words`, limiting the selection to words that can be typed in
+ASCII. But maybe you should just use [xkpa][].
+
+[xkcd936]: https://xkcd.com/936/
+[xkpa]: https://github.com/beala/xkcd-password
+
 ## Tokenization
 
 wordfreq uses the Python package `regex`, which is a more advanced
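A usage sketch for the `random_words` functions documented in the moved section above (this assumes wordfreq is installed and that `random_ascii_words` takes the same keyword arguments as `random_words`; the output varies by design, since the words are picked at random):

```python
from wordfreq import random_words, random_ascii_words

# Five words drawn from a 2^12 = 4096-word pool each: about 60 bits of
# entropy, in the style of xkcd 936.
print(random_words(lang='en', bits_per_word=12))

# The ASCII-limited variant is the one suited to typeable passwords.
print(random_ascii_words(lang='en', nwords=5, bits_per_word=12))
```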


@@ -162,7 +162,7 @@ def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
     phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    assert 1.0 / ff == pytest.approx(phrase_freq)
+    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
 
 
 def test_not_really_random():
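The reason these tests switch to `rel=0.01`: a value rounded to 3 significant digits can differ from its unrounded counterpart by up to about half a unit in the third digit, so `pytest.approx`'s default relative tolerance (roughly one part in a million) becomes too strict. A self-contained illustration with a made-up value:

```python
import pytest

exact = 1.1748975549395302e-05
rounded = 1.17e-05  # the same value kept to 3 significant digits

# Passes: the rounding error here is about 0.4%, within the 1% tolerance.
assert rounded == pytest.approx(exact, rel=0.01)

# Would fail with pytest.approx's default tolerance:
# assert rounded == pytest.approx(exact)
```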


@@ -49,11 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('おはようございます', 'ja') ==
-        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
     )


@@ -10,9 +10,9 @@ def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
     assert (
         1.0 / word_frequency('감사합니다', 'ko') ==
-        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
     )
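Both combination tests (Japanese above, Korean here) assert the same relation: the reciprocals of the token frequencies add up to the reciprocal of the phrase frequency, which is why a doubled token has half the token's frequency. A sketch of just that identity, using made-up numbers rather than wordfreq data:

```python
import math

def combined_frequency(token_freqs):
    # Reciprocals add, so the phrase frequency is the reciprocal of the sum.
    return 1.0 / sum(1.0 / f for f in token_freqs)

f = 2.0e-4
assert math.isclose(combined_frequency([f, f]), f / 2)  # doubled-token case

f_a, f_b = 2.0e-4, 5.0e-5
phrase = combined_frequency([f_a, f_b])
assert math.isclose(1.0 / phrase, 1.0 / f_a + 1.0 / f_b)  # reciprocal identity
```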


@@ -249,7 +249,14 @@ def _word_frequency(word, lang, wordlist, minimum):
         # probability for each word break that was inferred.
         freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
-    return max(freq, minimum)
+    # All our frequency data is only precise to within 1% anyway, so round
+    # it to 3 significant digits
+    unrounded = max(freq, minimum)
+    if unrounded == 0.:
+        return 0.
+    else:
+        leading_zeroes = math.floor(-math.log(unrounded, 10))
+        return round(unrounded, leading_zeroes + 3)
 
 
 def word_frequency(word, lang, wordlist='best', minimum=0.):
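The new rounding step, pulled out into a standalone runnable sketch (the function name is mine; the real module presumably imports `math` at the top of the file, in a hunk this view doesn't show):

```python
import math

def round_to_3_significant_digits(value):
    if value == 0.:
        return 0.
    # Count how many places the first significant digit sits past the
    # decimal point, then round three places beyond it.
    leading_zeroes = math.floor(-math.log(value, 10))
    return round(value, leading_zeroes + 3)

print(round_to_3_significant_digits(1.1748975549395302e-05))  # 1.17e-05
print(round_to_3_significant_digits(0.0234567))               # 0.0235
```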