Lower the frequency of phrases with inferred token boundaries
Former-commit-id: 5c8c36f4e3
commit a13f459f88 (parent 37e5e1009f)
@@ -190,10 +190,12 @@ into multiple tokens:
     >>> word_frequency('New York', 'en')
     0.0002315934248950231
     >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
-    2.342123813395707e-05
+    3.2187603965715087e-06

 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be.
+to provide an estimate of what their combined frequency would be. In languages
+written without spaces, there is also a penalty to the word frequency for each
+word break that must be inferred.

 This implicitly assumes that you're asking about words that frequently appear
 together. It's not multiplying the frequencies, because that would assume they
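
The README text above describes the whole calculation: take the half-harmonic mean of the token frequencies, then apply the boundary penalty for languages written without spaces. A minimal standalone sketch of that arithmetic, not the library's actual code path; `combined_frequency` is a hypothetical helper name:

    # Sketch of the combination described in the README hunk above.
    # `combined_frequency` is a hypothetical name, not a wordfreq API.
    def combined_frequency(token_freqs, spaceless=False):
        # Half-harmonic mean: reciprocal of the sum of reciprocals.
        freq = 1.0 / sum(1.0 / f for f in token_freqs)
        if spaceless:
            # One factor of 10 for each token boundary that had to be inferred.
            freq /= 10 ** (len(token_freqs) - 1)
        return freq

    # Two tokens at 1e-4 each: the half-harmonic mean is 5e-5, and the one
    # inferred boundary lowers the result to 5e-6.
    print(combined_frequency([1e-4, 1e-4], spaceless=True))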
@@ -43,6 +43,5 @@ def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
     assert_almost_equal(
         word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 2
+        xiexie_freq / 20
     )
-
@@ -14,10 +14,10 @@ def test_combination():

     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
+        ohayou_freq / 20
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
     )

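
The updated expectations in these test hunks follow from the factor of 10 per inferred boundary: a doubled word splits into two tokens, so the half-harmonic mean halves its frequency and the single inferred boundary divides it by 10 (hence /20), while the three-token phrase has two inferred boundaries, a 10**2 = 100 penalty, which is why each reciprocal term is scaled by 100. A quick arithmetic check with hypothetical frequencies, not values from any wordlist:

    import math

    f = 2.5e-4   # hypothetical frequency of a single token

    # Two identical tokens: half-harmonic mean is f / 2; one inferred
    # boundary divides by 10, so the phrase comes out at f / 20.
    two_tokens = (1.0 / (1.0 / f + 1.0 / f)) / 10
    assert math.isclose(two_tokens, f / 20)

    # Three tokens: two inferred boundaries divide by 10 ** 2 = 100, so the
    # reciprocal of the phrase frequency picks up a factor of 100 per term.
    f1, f2, f3 = 2.5e-4, 1.0e-4, 3.0e-4
    three_tokens = (1.0 / (1.0 / f1 + 1.0 / f2 + 1.0 / f3)) / 100
    assert math.isclose(1.0 / three_tokens, 100.0 / f1 + 100.0 / f2 + 100.0 / f3)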
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

+# Chinese and Japanese are written without spaces. This means we have to
+# run language-specific code to infer token boundaries on them, and also
+# that we need to adjust frequencies of multi-token phrases to account
+# for the fact that token boundaries were inferred.
+SPACELESS_LANGUAGES = {'zh', 'ja'}

 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -181,7 +186,18 @@ def _word_frequency(word, lang, wordlist, minimum):
             return minimum
         one_over_result += 1.0 / freqs[token]

-    return max(1.0 / one_over_result, minimum)
+    freq = 1.0 / one_over_result
+
+    if lang in SPACELESS_LANGUAGES:
+        # Divide the frequency by 10 for each token boundary that was inferred.
+        # (We determined the factor of 10 empirically by looking at words in
+        # the Chinese wordlist that weren't common enough to be identified by
+        # the tokenizer. These words would get split into multiple tokens, and
+        # their inferred frequency would be on average 9.77 times higher than
+        # their actual frequency.)
+        freq /= 10 ** (len(tokens) - 1)
+
+    return max(freq, minimum)

 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
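
As a usage note, the penalty shows up directly in word_frequency calls for the spaceless languages. The numbers in the comments below are the ones quoted in the README hunk above and will drift with later wordlist versions:

    from wordfreq import word_frequency

    # English phrases are split on spaces, so no boundary penalty applies.
    print(word_frequency('New York', 'en'))   # README quotes 0.0002315934248950231

    # '北京地铁' ("Beijing Subway") has its token boundaries inferred, and each
    # inferred boundary divides the combined frequency by 10.
    print(word_frequency('北京地铁', 'zh'))    # README quotes 3.2187603965715087e-06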