don't apply the inferred-space penalty to Japanese

This commit is contained in:
Rob Speer 2015-09-24 12:49:45 -04:00
parent bb70bdba58
commit db5eda6051
2 changed files with 8 additions and 8 deletions

View File

@@ -14,10 +14,10 @@ def test_combination():
assert_almost_equal( assert_almost_equal(
word_frequency('おはようおはよう', 'ja'), word_frequency('おはようおはよう', 'ja'),
ohayou_freq / 20 ohayou_freq / 2
) )
assert_almost_equal( assert_almost_equal(
1.0 / word_frequency('おはようございます', 'ja'), 1.0 / word_frequency('おはようございます', 'ja'),
(100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq) (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
) )

View File

@@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000 CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
# Chinese and Japanese are written without spaces. This means we have to # Chinese and Japanese are written without spaces. In Chinese, in particular,
# run language-specific code to infer token boundaries on them, and also # we have to infer word boundaries from the frequencies of the words they
# that we need to adjust frequencies of multi-token phrases to account # would create. When this happens, we should adjust the resulting frequency
# for the fact that token boundaries were inferred. # to avoid creating a bias toward improbable word combinations.
SPACELESS_LANGUAGES = {'zh', 'ja'} INFERRED_SPACE_LANGUAGES = {'zh'}
# We'll divide the frequency by 10 for each token boundary that was inferred. # We'll divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in the # (We determined the factor of 10 empirically by looking at words in the
@@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum):
freq = 1.0 / one_over_result freq = 1.0 / one_over_result
if lang in SPACELESS_LANGUAGES: if lang in INFERRED_SPACE_LANGUAGES:
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1) freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
return max(freq, minimum) return max(freq, minimum)