From 4d00f17477e67ef42f98c6cad0b4ae0a83f06da8 Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Thu, 24 Sep 2015 12:49:45 -0400
Subject: [PATCH] don't apply the inferred-space penalty to Japanese

Former-commit-id: db5eda605116e5441745cc6712abffea7f59a47b
---
 tests/test_japanese.py |  4 ++--
 wordfreq/__init__.py   | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/test_japanese.py b/tests/test_japanese.py
index af05c2a..9906741 100644
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -14,10 +14,10 @@ def test_combination():
 
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
     )
 
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 85e4711..e6a4849 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces, but only Chinese is listed
+# here: its word boundaries are inferred from the frequencies of the words
+# those boundaries would create. In that case we adjust the resulting
+# frequency to avoid a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}
 
 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum):
 
     freq = 1.0 / one_over_result
 
-    if lang in SPACELESS_LANGUAGES:
+    if lang in INFERRED_SPACE_LANGUAGES:
         freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)