don't apply the inferred-space penalty to Japanese

2024-12-23 17:31:41 +00:00 · 2015-09-24 12:49:45 -04:00 · 2015-09-24 12:49:45 -04:00 · db5eda6051
commit db5eda6051
parent bb70bdba58
2 changed files with 8 additions and 8 deletions
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -14,10 +14,10 @@ def test_combination():

    assert_almost_equal(
        word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
    )
    assert_almost_equal(
        1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
    )

--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces. In Chinese, in particular,
+# we have to infer word boundaries from the frequencies of the words they
+# would create. When this happens, we should adjust the resulting frequency
+# to avoid creating a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}

 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum):

    freq = 1.0 / one_over_result

-    if lang in SPACELESS_LANGUAGES:
+    if lang in INFERRED_SPACE_LANGUAGES:
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)

    return max(freq, minimum)