Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)
Lower the frequency of phrases with inferred token boundaries
commit f0c7c3a02c
parent 3dd70ed1c2
Former-commit-id: 5c8c36f4e3
@@ -190,10 +190,12 @@ into multiple tokens:
     >>> word_frequency('New York', 'en')
     0.0002315934248950231
     >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
-    2.342123813395707e-05
+    3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be.
+to provide an estimate of what their combined frequency would be. In languages
+written without spaces, there is also a penalty to the word frequency for each
+word break that must be inferred.
 
 This implicitly assumes that you're asking about words that frequently appear
 together. It's not multiplying the frequencies, because that would assume they
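To make the new rule concrete, here is a minimal sketch of the combination described above: the half-harmonic mean of the token frequencies (the reciprocal of the summed reciprocals), divided by 10 for each token boundary that had to be inferred. The function phrase_frequency and its arguments are hypothetical names for illustration, not wordfreq's API.

    def phrase_frequency(token_freqs, spaceless=False):
        # Hypothetical sketch of the combination rule, not wordfreq's code.
        # Half-harmonic mean: reciprocal of the summed reciprocal frequencies.
        # For two tokens of equal frequency f, this comes out to f / 2.
        one_over_result = sum(1.0 / f for f in token_freqs)
        freq = 1.0 / one_over_result
        if spaceless:
            # One factor of 10 per inferred token boundary, as in this commit.
            freq /= 10 ** (len(token_freqs) - 1)
        return freq

    # Two equally common tokens in a spaceless language: (f / 2) / 10 = f / 20.
    print(phrase_frequency([1e-4, 1e-4], spaceless=True))   # ≈ 5e-06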
@@ -43,6 +43,5 @@ def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
     assert_almost_equal(
         word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 2
+        xiexie_freq / 20
     )
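The new expected value follows directly from the boundary penalty: '谢谢谢谢' tokenizes as two copies of '谢谢', the half-harmonic mean of two equal frequencies f is 1 / (1/f + 1/f) = f / 2, and the one inferred token boundary divides that by 10, for f / 20 overall.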
@@ -14,10 +14,10 @@ def test_combination():
 
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
+        ohayou_freq / 20
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
     )
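Similarly, 'おはようございます' yields three tokens (おはよう, ござい, ます, matching the frequencies the test compares against), so two boundaries are inferred and the penalty is 10^2 = 100. On the reciprocal scale used by the test, dividing the frequency by 100 multiplies each reciprocal term by 100, which is why every term in the expected sum is now scaled by 100.0.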
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+# Chinese and Japanese are written without spaces. This means we have to
+# run language-specific code to infer token boundaries on them, and also
+# that we need to adjust frequencies of multi-token phrases to account
+# for the fact that token boundaries were inferred.
+SPACELESS_LANGUAGES = {'zh', 'ja'}
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -181,7 +186,18 @@ def _word_frequency(word, lang, wordlist, minimum):
             return minimum
         one_over_result += 1.0 / freqs[token]
 
-    return max(1.0 / one_over_result, minimum)
+    freq = 1.0 / one_over_result
+
+    if lang in SPACELESS_LANGUAGES:
+        # Divide the frequency by 10 for each token boundary that was inferred.
+        # (We determined the factor of 10 empirically by looking at words in
+        # the Chinese wordlist that weren't common enough to be identified by
+        # the tokenizer. These words would get split into multiple tokens, and
+        # their inferred frequency would be on average 9.77 times higher than
+        # their actual frequency.)
+        freq /= 10 ** (len(tokens) - 1)
+
+    return max(freq, minimum)
 
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
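As a usage sketch, the README example above can be reproduced in an interactive session (assuming wordfreq with its bundled 'combined' wordlist is installed; the value shown is the one from the updated README, and the tokenizer splits the phrase into multiple tokens, presumably 北京 + 地铁, so inferred boundaries are penalized):

    >>> from wordfreq import word_frequency
    >>> word_frequency('北京地铁', 'zh')   # "Beijing Subway"
    3.2187603965715087e-06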