Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
change math for INFERRED_SPACE_FACTOR to not overflow
parent 02c3cbe3fb
commit bd57b64d00
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
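A note on what the new test exercises (this explanation is not part of the commit): a string like 'l' * 800 matches nothing in the Chinese wordlist, so the tokenizer ends up splitting it into a large number of tokens, and the per-word-break penalty in _word_frequency gets raised to a power of several hundred. The sketch below reproduces the old failure mode; the factor value 10 is only a stand-in for INFERRED_SPACE_FACTOR, whose real definition lives elsewhere in wordfreq and is not shown in this diff.

    # Stand-in for wordfreq's INFERRED_SPACE_FACTOR; the real constant is
    # defined elsewhere in the library, not in this diff.
    INFERRED_SPACE_FACTOR = 10

    freq = 1e-9        # a plausible small per-token frequency
    num_tokens = 800   # order of magnitude of tokens the test string presumably produces

    try:
        # Old formula: with an integer factor, the power is an exact 800-digit
        # integer; converting it to a float for the division overflows and
        # raises OverflowError. (A float factor would overflow at ** instead,
        # raising the same exception.)
        freq / INFERRED_SPACE_FACTOR ** (num_tokens - 1)
    except OverflowError as err:
        print("old formula raised:", err)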
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
     # If we used the Jieba tokenizer, we could tokenize anything to match
     # our wordlist, even nonsense. To counteract this, we multiply by a
     # probability for each word break that was inferred.
-    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+    freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
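A minimal sketch (using the same stand-in value of 10 for INFERRED_SPACE_FACTOR) of why the rewritten expression avoids the crash: the negative exponent keeps the computation in float arithmetic, where a hugely negative power underflows quietly to 0.0 instead of raising OverflowError. That underflow is exactly what test_unreasonably_long() asserts.

    INFERRED_SPACE_FACTOR = 10   # stand-in value, as above
    freq = 1e-9
    num_tokens = 800

    # New formula: a negative exponent makes ** return a float, and for
    # exponents this large the result underflows to 0.0 rather than raising.
    penalized = freq * INFERRED_SPACE_FACTOR ** -(num_tokens - 1)

    print(penalized)             # 0.0
    assert penalized < 1e-300    # consistent with the new test's expectation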