Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
change math for INFERRED_SPACE_FACTOR to not overflow
This commit is contained in:
parent 02c3cbe3fb
commit bd57b64d00
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+        freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
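
Why the sign flip helps: with hundreds of inferred word breaks, the positive power of INFERRED_SPACE_FACTOR can exceed the range of a double, and Python raises OverflowError before the division happens; the negative power instead underflows quietly to 0.0, so an absurdly long nonsense string gets a frequency of 0 rather than a crash, which is what the new test expects. A minimal sketch of the difference, assuming INFERRED_SPACE_FACTOR is a float constant (the value 10.0 and the numbers below are illustrative, not taken from wordfreq):

INFERRED_SPACE_FACTOR = 10.0   # assumed value for illustration
freq = 1e-9                    # hypothetical combined token frequency
n_breaks = 799                 # e.g. 'l' * 800 tokenized into 800 one-character tokens

# Old form: the positive power overflows a double, so this raises OverflowError.
try:
    penalized = freq / INFERRED_SPACE_FACTOR ** n_breaks
except OverflowError as exc:
    print('old form raises:', exc)

# New form: the negative power underflows to 0.0 instead of raising,
# so the penalized frequency is simply 0.0 for unreasonably long inputs.
penalized = freq * INFERRED_SPACE_FACTOR ** -n_breaks
print('new form returns:', penalized)   # 0.0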