From 75a56b68fb19ab9f8a4e5a0bc2b2221cf2e6b463 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 14:44:39 -0500
Subject: [PATCH] change math for INFERRED_SPACE_FACTOR to not overflow

---
 tests/test_chinese.py | 8 ++++++++
 wordfreq/__init__.py  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index ce157db..83e2d70 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index bad4c92..17c910a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+        freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
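
Note (reviewer sketch, not part of the patch): the change works because of how Python
treats the two forms of the same math. With many inferred word breaks, the old
expression computes a huge positive power of INFERRED_SPACE_FACTOR and raises
OverflowError before the division ever happens; the new expression computes a tiny
negative power, which quietly underflows to 0.0, so the product becomes 0.0 instead
of crashing. A minimal sketch, assuming INFERRED_SPACE_FACTOR is a float around 10.0
and using a made-up token count in place of whatever Jieba actually produces for
'l' * 800:

    INFERRED_SPACE_FACTOR = 10.0   # assumed value, for illustration only
    freq = 1e-6                    # some small combined frequency
    n_breaks = 799                 # hypothetical number of inferred word breaks

    # Old math: the large positive power overflows before the division happens.
    try:
        freq / INFERRED_SPACE_FACTOR ** n_breaks
    except OverflowError as err:
        print('old math raises:', err)

    # New math: the negative power underflows to 0.0, so the product is 0.0,
    # which the new test then observes as a frequency below 1e-300.
    print('new math yields:', freq * INFERRED_SPACE_FACTOR ** -n_breaks)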