From 13642d6a4d53d8a00860e7d4bd5b6599c65197cd Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 22 Sep 2015 16:46:07 -0400
Subject: [PATCH] replace the literal 10 with the constant INFERRED_SPACE_FACTOR

Former-commit-id: 7a3ea2bf796c3f31fdf7d1c441b12b8ec52acf50
---
 wordfreq/__init__.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 4790282..85e4711 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -21,6 +21,14 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 # for the fact that token boundaries were inferred.
 SPACELESS_LANGUAGES = {'zh', 'ja'}
 
+# We'll divide the frequency by 10 for each token boundary that was inferred.
+# (We determined the factor of 10 empirically by looking at words in the
+# Chinese wordlist that weren't common enough to be identified by the
+# tokenizer. These words would get split into multiple tokens, and their
+# inferred frequency would be on average 9.77 times higher than their actual
+# frequency.)
+INFERRED_SPACE_FACTOR = 10.0
+
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
 simple_tokenize = simple_tokenize
@@ -190,13 +198,7 @@ def _word_frequency(word, lang, wordlist, minimum):
     freq = 1.0 / one_over_result
 
     if lang in SPACELESS_LANGUAGES:
-        # Divide the frequency by 10 for each token boundary that was inferred.
-        # (We determined the factor of 10 empirically by looking at words in
-        # the Chinese wordlist that weren't common enough to be identified by
-        # the tokenizer. These words would get split into multiple tokens, and
-        # their inferred frequency would be on average 9.77 times higher than
-        # their actual frequency.)
-        freq /= 10 ** (len(tokens) - 1)
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
 