From 4d00f17477e67ef42f98c6cad0b4ae0a83f06da8 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 24 Sep 2015 12:49:45 -0400 Subject: [PATCH] don't apply the inferred-space penalty to Japanese Former-commit-id: db5eda605116e5441745cc6712abffea7f59a47b --- tests/test_japanese.py | 4 ++-- wordfreq/__init__.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_japanese.py b/tests/test_japanese.py index af05c2a..9906741 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -14,10 +14,10 @@ def test_combination(): assert_almost_equal( word_frequency('おはようおはよう', 'ja'), - ohayou_freq / 20 + ohayou_freq / 2 ) assert_almost_equal( 1.0 / word_frequency('おはようございます', 'ja'), - (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq) + (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq) ) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 85e4711..e6a4849 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -15,11 +15,11 @@ logger = logging.getLogger(__name__) CACHE_SIZE = 100000 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) -# Chinese and Japanese are written without spaces. This means we have to -# run language-specific code to infer token boundaries on them, and also -# that we need to adjust frequencies of multi-token phrases to account -# for the fact that token boundaries were inferred. -SPACELESS_LANGUAGES = {'zh', 'ja'} +# Chinese and Japanese are written without spaces. In Chinese, in particular, +# we have to infer word boundaries from the frequencies of the words they +# would create. When this happens, we should adjust the resulting frequency +# to avoid creating a bias toward improbable word combinations. +INFERRED_SPACE_LANGUAGES = {'zh'} # We'll divide the frequency by 10 for each token boundary that was inferred. # (We determined the factor of 10 empirically by looking at words in the @@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum): freq = 1.0 / one_over_result - if lang in SPACELESS_LANGUAGES: + if lang in INFERRED_SPACE_LANGUAGES: freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1) return max(freq, minimum)