Lower the frequency of phrases with inferred token boundaries

Former-commit-id: 5c8c36f4e3
Rob Speer 2015-09-10 14:16:22 -04:00
parent 37e5e1009f
commit a13f459f88
4 changed files with 24 additions and 7 deletions


@@ -190,10 +190,12 @@ into multiple tokens:
     >>> word_frequency('New York', 'en')
     0.0002315934248950231
     >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
-    2.342123813395707e-05
+    3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be.
+to provide an estimate of what their combined frequency would be. In languages
+written without spaces, there is also a penalty to the word frequency for each
+word break that must be inferred.
 
 This implicitly assumes that you're asking about words that frequently appear
 together. It's not multiplying the frequencies, because that would assume they
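As a rough sketch of the rule described above, the combination with the new
boundary penalty can be written as follows; combined_frequency and its inputs
are hypothetical stand-ins, not wordfreq's actual API.

    # Sketch only: combine per-token frequencies as the README describes.
    def combined_frequency(freqs, inferred_boundaries=0):
        # Half-harmonic-mean combination: 1/f = 1/f1 + 1/f2 + ...
        freq = 1.0 / sum(1.0 / f for f in freqs)
        # Penalty introduced by this commit: divide by 10 per inferred boundary.
        return freq / 10 ** inferred_boundaries

    # Two tokens of equal frequency f combine to f/2; one inferred boundary
    # turns that into f/20, matching the updated tests below.
    print(combined_frequency([0.01, 0.01], inferred_boundaries=1))   # 0.0005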


@@ -43,6 +43,5 @@ def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
     assert_almost_equal(
         word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 2
+        xiexie_freq / 20
     )
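The expected value changes from xiexie_freq / 2 to xiexie_freq / 20 because
'谢谢谢谢' tokenizes into two copies of '谢谢': the half-harmonic mean of two
equal frequencies is half of either one, and the single inferred token boundary
now divides that result by a further factor of 10.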


@@ -14,10 +14,10 @@ def test_combination():
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
+        ohayou_freq / 20
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
     )
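Similarly, 'おはようございます' tokenizes into three pieces, so two token
boundaries are inferred and the combined frequency is divided by 100; taking
reciprocals, 1.0 / word_frequency(...) is expected to equal 100 times the sum
of the reciprocal token frequencies, hence the 100.0 factors.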


@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+# Chinese and Japanese are written without spaces. This means we have to
+# run language-specific code to infer token boundaries on them, and also
+# that we need to adjust frequencies of multi-token phrases to account
+# for the fact that token boundaries were inferred.
+SPACELESS_LANGUAGES = {'zh', 'ja'}
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -181,7 +186,18 @@ def _word_frequency(word, lang, wordlist, minimum):
             return minimum
         one_over_result += 1.0 / freqs[token]
 
-    return max(1.0 / one_over_result, minimum)
+    freq = 1.0 / one_over_result
+
+    if lang in SPACELESS_LANGUAGES:
+        # Divide the frequency by 10 for each token boundary that was inferred.
+        # (We determined the factor of 10 empirically by looking at words in
+        # the Chinese wordlist that weren't common enough to be identified by
+        # the tokenizer. These words would get split into multiple tokens, and
+        # their inferred frequency would be on average 9.77 times higher than
+        # their actual frequency.)
+        freq /= 10 ** (len(tokens) - 1)
+
+    return max(freq, minimum)
 
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """