diff --git a/README.md b/README.md
index 26b6f56..5ab4df2 100644
--- a/README.md
+++ b/README.md
@@ -190,10 +190,12 @@ into multiple tokens:
     >>> word_frequency('New York', 'en')
     0.0002315934248950231
     >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
-    2.342123813395707e-05
+    3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be.
+to provide an estimate of what their combined frequency would be. In languages
+written without spaces, there is also a penalty to the word frequency for each
+word break that must be inferred.
 
 This implicitly assumes that you're asking about words that frequently appear
 together. It's not multiplying the frequencies, because that would assume they
diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 32a6fe2..f1ae197 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -43,6 +43,5 @@ def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
     assert_almost_equal(
         word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 2
+        xiexie_freq / 20
     )
-
diff --git a/tests/test_japanese.py b/tests/test_japanese.py
index d5a73b3..af05c2a 100644
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -14,10 +14,10 @@ def test_combination():
 
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
+        ohayou_freq / 20
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
     )
 
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index e939127..d5b95a2 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+# Chinese and Japanese are written without spaces. This means we have to
+# run language-specific code to infer token boundaries on them, and also
+# that we need to adjust frequencies of multi-token phrases to account
+# for the fact that token boundaries were inferred.
+SPACELESS_LANGUAGES = {'zh', 'ja'}
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -181,7 +186,18 @@ def _word_frequency(word, lang, wordlist, minimum):
             return minimum
         one_over_result += 1.0 / freqs[token]
 
-    return max(1.0 / one_over_result, minimum)
+    freq = 1.0 / one_over_result
+
+    if lang in SPACELESS_LANGUAGES:
+        # Divide the frequency by 10 for each token boundary that was inferred.
+        # (We determined the factor of 10 empirically by looking at words in
+        # the Chinese wordlist that weren't common enough to be identified by
+        # the tokenizer. These words would get split into multiple tokens, and
+        # their inferred frequency would be on average 9.77 times higher than
+        # their actual frequency.)
+        freq /= 10 ** (len(tokens) - 1)
+
+    return max(freq, minimum)
 
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
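To sanity-check the arithmetic this change introduces, here is a small standalone sketch (made-up frequencies and a hypothetical helper, not the wordfreq API itself): per-token frequencies are combined with the half-harmonic mean, and for spaceless languages the result is then divided by 10 for each inferred token boundary.

    def combined_frequency(token_freqs, spaceless=False):
        """Estimate a phrase frequency from its per-token frequencies."""
        # Half-harmonic-mean combination: 1/f = 1/f1 + 1/f2 + ...
        one_over_result = sum(1.0 / f for f in token_freqs)
        freq = 1.0 / one_over_result
        if spaceless:
            # One factor of 10 per inferred token boundary.
            freq /= 10 ** (len(token_freqs) - 1)
        return freq

    # Two tokens with equal frequency f: the half-harmonic mean gives f / 2,
    # and the boundary penalty divides that by 10, giving f / 20 -- which is
    # why the tests above change `xiexie_freq / 2` to `xiexie_freq / 20`.
    f = 1e-4
    assert abs(combined_frequency([f, f], spaceless=True) - f / 20) < 1e-12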