From 75a56b68fb19ab9f8a4e5a0bc2b2221cf2e6b463 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 14:44:39 -0500
Subject: [PATCH] change math for INFERRED_SPACE_FACTOR to not overflow

---
 tests/test_chinese.py | 8 ++++++++
 wordfreq/__init__.py  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index ce157db..83e2d70 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index bad4c92..17c910a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+        freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
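
Note (reviewer sketch, not part of the patch): the change works because of how Python
treats the two forms of the same math. With many inferred word breaks, the old
expression computes a huge positive power of INFERRED_SPACE_FACTOR and raises
OverflowError before the division ever happens; the new expression computes a tiny
negative power, which quietly underflows to 0.0, so the product becomes 0.0 instead
of crashing. A minimal sketch, assuming INFERRED_SPACE_FACTOR is a float around 10.0
and using a made-up token count in place of whatever Jieba actually produces for
'l' * 800:

    INFERRED_SPACE_FACTOR = 10.0   # assumed value, for illustration only
    freq = 1e-6                    # some small combined frequency
    n_breaks = 799                 # hypothetical number of inferred word breaks

    # Old math: the large positive power overflows before the division happens.
    try:
        freq / INFERRED_SPACE_FACTOR ** n_breaks
    except OverflowError as err:
        print('old math raises:', err)

    # New math: the negative power underflows to 0.0, so the product is 0.0,
    # which the new test then observes as a frequency below 1e-300.
    print('new math yields:', freq * INFERRED_SPACE_FACTOR ** -n_breaks)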