Traditional Chinese should be preserved through tokenization

Rob Speer 2018-03-08 18:08:55 -05:00
parent 5a5acec9ff
commit 47dac3b0b8
2 changed files with 19 additions and 3 deletions


@@ -55,10 +55,19 @@ def test_tokens():
         ]
     )
-    # You match the same tokens if you look it up in Traditional Chinese.
-    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
     # Check that Traditional Chinese works at all
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+    # You get the same token lengths if you look it up in Traditional Chinese,
+    # but the words are different
+    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
+    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
+    eq_(''.join(simp_tokens), fact_simplified)
+    eq_(''.join(trad_tokens), fact_traditional)
+    simp_lengths = [len(token) for token in simp_tokens]
+    trad_lengths = [len(token) for token in trad_tokens]
+    eq_(simp_lengths, trad_lengths)
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
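
The replaced assertions pin down the new invariant: with include_punctuation=True, the tokens are exact slices of the input, so joining them reproduces the original string, and the Simplified and Traditional forms of the same sentence split at the same character offsets even though the tokens themselves differ. A rough usage sketch of the updated behaviour from the wordfreq API, assuming wordfreq is installed with its CJK extras; the sentence pair below is an illustrative stand-in, not the test's actual fact_simplified / fact_traditional fixtures:

    from wordfreq import tokenize, word_frequency

    # Illustrative Simplified/Traditional pair ("Cats are cute animals"),
    # standing in for the test's fact_simplified / fact_traditional fixtures.
    fact_simplified = '猫是可爱的动物'
    fact_traditional = '貓是可愛的動物'

    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)

    # Traditional tokens keep their Traditional characters...
    assert ''.join(trad_tokens) == fact_traditional
    # ...and both versions break at the same character offsets.
    assert [len(t) for t in simp_tokens] == [len(t) for t in trad_tokens]
    # Traditional text still has a nonzero frequency.
    assert word_frequency(fact_traditional, 'zh') > 0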


@@ -49,4 +49,11 @@ def jieba_tokenize(text, external_wordlist=False):
     else:
         if jieba_tokenizer is None:
             jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+
+        # Tokenize the Simplified Chinese version of the text, but return
+        # those spans from the original text, even if it's in Traditional
+        # Chinese
+        tokens = []
+        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+            tokens.append(text[start:end])
+        return tokens
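
The span trick in jieba_tokenize works because wordfreq's simplify_chinese step maps characters one-for-one, so the (start, end) offsets that jieba's tokenize() reports for the simplified string are also valid offsets into the original, possibly Traditional, string. A minimal standalone sketch of the same idea, using plain jieba and a toy character table in place of wordfreq's real mapping; the table and function name here are illustrative, not wordfreq code:

    import jieba

    # Toy Traditional -> Simplified character table; wordfreq's mapping covers
    # far more characters, but is likewise character-for-character.
    TRAD_TO_SIMP = str.maketrans('貓愛動', '猫爱动')

    def tokenize_preserving_original(text):
        """Segment with jieba on the Simplified form, but return slices of the
        original text so Traditional characters survive tokenization."""
        simplified = text.translate(TRAD_TO_SIMP)  # same length as `text`
        # jieba.tokenize() yields (word, start, end); because simplification is
        # length-preserving, the offsets index into the original text as well.
        return [text[start:end]
                for _word, start, end in jieba.tokenize(simplified, HMM=False)]

    print(tokenize_preserving_original('貓是可愛的動物'))
    # e.g. ['貓', '是', '可愛', '的', '動物']; the output keeps Traditional characters

Tokenizing the Simplified form keeps jieba's Simplified-oriented dictionary effective on Traditional input, while returning slices of the original means callers never receive a silently converted string.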