Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
change math for INFERRED_SPACE_FACTOR to not overflow
parent 7318f58df9
commit 75a56b68fb
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
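As a usage-level illustration of what the new test guards against: tokenizing 800 repeated characters in Chinese infers hundreds of word breaks, and before this commit the accumulated per-break penalty overflowed (hence "crashed earlier versions"). A hedged sketch of the expected post-fix behavior, using wordfreq's public word_frequency and zipf_frequency functions (the exact return value is an assumption; the test only requires it to be vanishingly small):

from wordfreq import word_frequency, zipf_frequency

# 800 characters tokenize into many pieces; each inferred word break divides
# the frequency further, so the result bottoms out at (effectively) zero
# instead of raising an overflow error.
lots_of_ls = 'l' * 800
print(word_frequency(lots_of_ls, 'zh'))   # expected: 0.0, in any case < 1e-300
print(zipf_frequency(lots_of_ls, 'zh'))   # expected: 0.0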
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
     # If we used the Jieba tokenizer, we could tokenize anything to match
     # our wordlist, even nonsense. To counteract this, we multiply by a
     # probability for each word break that was inferred.
-    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+    freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
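The two expressions are mathematically identical; the one-character change only avoids ever forming the huge positive power. With roughly 800 inferred tokens, INFERRED_SPACE_FACTOR ** 799 is far beyond float range, so the old division raised OverflowError, while a negative exponent underflows toward zero, which Python returns as 0.0 rather than raising. A minimal sketch of the difference, assuming a value of 10.0 for the constant (its actual value is not shown in this diff) and a made-up per-token probability product:

INFERRED_SPACE_FACTOR = 10.0   # assumed value for illustration

freq = 1e-9          # hypothetical product of per-token frequencies
num_tokens = 800     # e.g. 'l' * 800 tokenized one character at a time

# Old form: the positive power overflows before the division ever happens.
try:
    penalized = freq / INFERRED_SPACE_FACTOR ** (num_tokens - 1)
except OverflowError as exc:
    print('old form raises:', exc)

# New form: the negative power underflows to 0.0 instead of raising, so the
# frequency for an absurdly long input simply bottoms out at zero.
penalized = freq * INFERRED_SPACE_FACTOR ** -(num_tokens - 1)
print('new form returns:', penalized)   # 0.0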