Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)
Lower the frequency of phrases with inferred token boundaries
commit f0c7c3a02c
parent 3dd70ed1c2
Former-commit-id: 5c8c36f4e3
@@ -190,10 +190,12 @@ into multiple tokens:
     >>> word_frequency('New York', 'en')
     0.0002315934248950231
     >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
-    2.342123813395707e-05
+    3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be.
+to provide an estimate of what their combined frequency would be. In languages
+written without spaces, there is also a penalty to the word frequency for each
+word break that must be inferred.
 
 This implicitly assumes that you're asking about words that frequently appear
 together. It's not multiplying the frequencies, because that would assume they
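To make the new rule concrete, here is a minimal sketch of the combination described above: the half-harmonic mean of the token frequencies (the reciprocal of the summed reciprocals), divided by 10 for each token boundary that had to be inferred. The function phrase_frequency and its arguments are hypothetical names for illustration, not wordfreq's API.

    def phrase_frequency(token_freqs, spaceless=False):
        # Hypothetical sketch of the combination rule, not wordfreq's code.
        # Half-harmonic mean: reciprocal of the summed reciprocal frequencies.
        # For two tokens of equal frequency f, this comes out to f / 2.
        one_over_result = sum(1.0 / f for f in token_freqs)
        freq = 1.0 / one_over_result
        if spaceless:
            # One factor of 10 per inferred token boundary, as in this commit.
            freq /= 10 ** (len(token_freqs) - 1)
        return freq

    # Two equally common tokens in a spaceless language: (f / 2) / 10 = f / 20.
    print(phrase_frequency([1e-4, 1e-4], spaceless=True))   # ≈ 5e-06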
@@ -43,6 +43,5 @@ def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
     assert_almost_equal(
         word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 2
+        xiexie_freq / 20
     )
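The new expected value follows directly from the boundary penalty: '谢谢谢谢' tokenizes as two copies of '谢谢', the half-harmonic mean of two equal frequencies f is 1 / (1/f + 1/f) = f / 2, and the one inferred token boundary divides that by 10, for f / 20 overall.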
@@ -14,10 +14,10 @@ def test_combination():
 
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
+        ohayou_freq / 20
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
     )
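Similarly, 'おはようございます' yields three tokens (おはよう, ござい, ます, matching the frequencies the test compares against), so two boundaries are inferred and the penalty is 10^2 = 100. On the reciprocal scale used by the test, dividing the frequency by 100 multiplies each reciprocal term by 100, which is why every term in the expected sum is now scaled by 100.0.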
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+# Chinese and Japanese are written without spaces. This means we have to
+# run language-specific code to infer token boundaries on them, and also
+# that we need to adjust frequencies of multi-token phrases to account
+# for the fact that token boundaries were inferred.
+SPACELESS_LANGUAGES = {'zh', 'ja'}
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -181,7 +186,18 @@ def _word_frequency(word, lang, wordlist, minimum):
             return minimum
         one_over_result += 1.0 / freqs[token]
 
-    return max(1.0 / one_over_result, minimum)
+    freq = 1.0 / one_over_result
+
+    if lang in SPACELESS_LANGUAGES:
+        # Divide the frequency by 10 for each token boundary that was inferred.
+        # (We determined the factor of 10 empirically by looking at words in
+        # the Chinese wordlist that weren't common enough to be identified by
+        # the tokenizer. These words would get split into multiple tokens, and
+        # their inferred frequency would be on average 9.77 times higher than
+        # their actual frequency.)
+        freq /= 10 ** (len(tokens) - 1)
+
+    return max(freq, minimum)
 
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
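As a usage sketch, the README example above can be reproduced in an interactive session (assuming wordfreq with its bundled 'combined' wordlist is installed; the value shown is the one from the updated README, and the tokenizer splits the phrase into multiple tokens, presumably 北京 + 地铁, so inferred boundaries are penalized):

    >>> from wordfreq import word_frequency
    >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
    3.2187603965715087e-06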