Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Traditional Chinese should be preserved through tokenization
This commit is contained in:
parent 45064a292f
commit 8e3dff3c1c
@@ -55,10 +55,19 @@ def test_tokens():
         ]
     )
 
-    # You match the same tokens if you look it up in Traditional Chinese.
-    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
     # Check that Traditional Chinese works at all
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+
+    # You get the same token lengths if you look it up in Traditional Chinese,
+    # but the words are different
+    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
+    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
+    eq_(''.join(simp_tokens), fact_simplified)
+    eq_(''.join(trad_tokens), fact_traditional)
+    simp_lengths = [len(token) for token in simp_tokens]
+    trad_lengths = [len(token) for token in trad_tokens]
+    eq_(simp_lengths, trad_lengths)
+
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
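For context, the new assertions boil down to the following standalone check. This is an illustrative sketch, not part of the commit: it assumes a wordfreq install that includes this change, uses made-up sample sentences in place of the test's actual fact_simplified / fact_traditional values (which the diff does not show), and uses plain asserts instead of nose's eq_ / assert_greater.

from wordfreq import tokenize, word_frequency

# Hypothetical stand-ins for the test's fact_simplified / fact_traditional
fact_simplified = '他是历史上第六位担任美国副总统的'
fact_traditional = '他是歷史上第六位擔任美國副總統的'

simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)

# Tokenization no longer rewrites the text into Simplified Chinese:
# joining the tokens reproduces each original string, and the token
# boundaries (hence lengths) are the same in both scripts.
assert ''.join(simp_tokens) == fact_simplified
assert ''.join(trad_tokens) == fact_traditional
assert [len(t) for t in simp_tokens] == [len(t) for t in trad_tokens]

# Traditional Chinese text still gets a nonzero frequency estimate.
assert word_frequency(fact_traditional, 'zh') > 0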
@@ -49,4 +49,11 @@ def jieba_tokenize(text, external_wordlist=False):
     else:
         if jieba_tokenizer is None:
             jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+
+        # Tokenize the Simplified Chinese version of the text, but return
+        # those spans from the original text, even if it's in Traditional
+        # Chinese
+        tokens = []
+        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+            tokens.append(text[start:end])
+        return tokens
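The core idea in jieba_tokenize is to segment a Simplified Chinese copy of the text with jieba, then slice the reported (start, end) spans out of the original string, so Traditional characters pass through untouched. This works because the simplification step maps characters one-to-one, keeping offsets aligned. Below is a minimal self-contained sketch of the same idea using stock jieba, with a tiny hand-made translation table standing in for wordfreq's simplify_chinese() and custom dictionary (neither of which appears in this diff).

import jieba

# Toy Traditional-to-Simplified map; wordfreq's simplify_chinese() covers far
# more characters, but the key property is the same: one character maps to
# one character, so string offsets stay aligned.
SIMPLIFY_MAP = str.maketrans('歷擔國總統', '历担国总统')

def tokenize_preserving_original(text):
    simplified = text.translate(SIMPLIFY_MAP)  # same length as `text`
    tokens = []
    # jieba.tokenize yields (word, start, end) offsets into `simplified`;
    # slicing `text` with those offsets keeps the original characters.
    for _word, start, end in jieba.tokenize(simplified, HMM=False):
        tokens.append(text[start:end])
    return tokens

trad = '他是歷史上第六位擔任美國副總統的'
tokens = tokenize_preserving_original(trad)
assert ''.join(tokens) == trad  # the Traditional text is preserved

The span slicing is what makes the round trip in the test above hold: whatever segmentation jieba produces on the simplified copy, the returned tokens always concatenate back to the original input.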