Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
don't apply the inferred-space penalty to Japanese
This commit is contained in:
Parent: bb70bdba58
Commit: db5eda6051
@ -14,10 +14,10 @@ def test_combination():
|
|||||||
|
|
||||||
assert_almost_equal(
|
assert_almost_equal(
|
||||||
word_frequency('おはようおはよう', 'ja'),
|
word_frequency('おはようおはよう', 'ja'),
|
||||||
ohayou_freq / 20
|
ohayou_freq / 2
|
||||||
)
|
)
|
||||||
assert_almost_equal(
|
assert_almost_equal(
|
||||||
1.0 / word_frequency('おはようございます', 'ja'),
|
1.0 / word_frequency('おはようございます', 'ja'),
|
||||||
(100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
|
(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
|
|||||||
CACHE_SIZE = 100000
|
CACHE_SIZE = 100000
|
||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
# Chinese and Japanese are written without spaces. This means we have to
|
# Chinese and Japanese are written without spaces. In Chinese, in particular,
|
||||||
# run language-specific code to infer token boundaries on them, and also
|
# we have to infer word boundaries from the frequencies of the words they
|
||||||
# that we need to adjust frequencies of multi-token phrases to account
|
# would create. When this happens, we should adjust the resulting frequency
|
||||||
# for the fact that token boundaries were inferred.
|
# to avoid creating a bias toward improbable word combinations.
|
||||||
SPACELESS_LANGUAGES = {'zh', 'ja'}
|
INFERRED_SPACE_LANGUAGES = {'zh'}
|
||||||
|
|
||||||
# We'll divide the frequency by 10 for each token boundary that was inferred.
|
# We'll divide the frequency by 10 for each token boundary that was inferred.
|
||||||
# (We determined the factor of 10 empirically by looking at words in the
|
# (We determined the factor of 10 empirically by looking at words in the
|
||||||
@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum):
|
|||||||
|
|
||||||
freq = 1.0 / one_over_result
|
freq = 1.0 / one_over_result
|
||||||
|
|
||||||
if lang in SPACELESS_LANGUAGES:
|
if lang in INFERRED_SPACE_LANGUAGES:
|
||||||
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
|
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
|
||||||
|
|
||||||
return max(freq, minimum)
|
return max(freq, minimum)
|
||||||
|
Loading…
Reference in New Issue
Block a user