change math for INFERRED_SPACE_FACTOR to not overflow

This commit is contained in:
Robyn Speer 2021-02-18 14:44:39 -05:00
parent 02c3cbe3fb
commit bd57b64d00
2 changed files with 9 additions and 1 deletions

View File

@ -77,3 +77,11 @@ def test_alternate_codes():
    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
def test_unreasonably_long():
    """Regression test: an absurdly long token must not crash wordfreq.

    Earlier versions crashed (numeric overflow in the inferred-space
    penalty) when asked for the frequency of a very long nonsense token.
    """
    monster_token = 'l' * 800
    # The penalty per inferred word break drives the frequency toward zero.
    assert word_frequency(monster_token, 'zh') < 1e-300
    assert zipf_frequency(monster_token, 'zh') == 0.

View File

@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
    # If we used the Jieba tokenizer, we could tokenize anything to match
    # our wordlist, even nonsense. To counteract this, we multiply by a
    # probability for each word break that was inferred.
-   freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+   freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
# All our frequency data is only precise to within 1% anyway, so round # All our frequency data is only precise to within 1% anyway, so round
# it to 3 significant digits # it to 3 significant digits