Traditional Chinese should be preserved through tokenization

Rob Speer 2018-03-08 18:08:55 -05:00
parent 5a5acec9ff
commit 47dac3b0b8
2 changed files with 19 additions and 3 deletions


@@ -55,10 +55,19 @@ def test_tokens():
         ]
     )
 
-    # You match the same tokens if you look it up in Traditional Chinese.
-    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
+    # Check that Traditional Chinese works at all
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
 
+    # You get the same token lengths if you look it up in Traditional Chinese,
+    # but the words are different
+    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
+    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
+    eq_(''.join(simp_tokens), fact_simplified)
+    eq_(''.join(trad_tokens), fact_traditional)
+    simp_lengths = [len(token) for token in simp_tokens]
+    trad_lengths = [len(token) for token in trad_tokens]
+    eq_(simp_lengths, trad_lengths)
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
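
The rewritten test relies on simplify_chinese being a character-for-character mapping, so the Simplified and Traditional versions of the sentence tokenize into spans of identical lengths even though the characters differ. A minimal sketch of that property using wordfreq's public tokenize function (the strings below are illustrative, not the ones used in the test):

    from wordfreq import tokenize

    simp = '中文维基百科'   # Simplified Chinese (illustrative)
    trad = '中文維基百科'   # Traditional equivalent, same length

    simp_tokens = tokenize(simp, 'zh')
    trad_tokens = tokenize(trad, 'zh')

    # Same number of tokens with the same lengths, but the Traditional
    # tokens keep their Traditional characters.
    assert [len(t) for t in simp_tokens] == [len(t) for t in trad_tokens]
    assert ''.join(trad_tokens) == trad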


@@ -49,4 +49,11 @@ def jieba_tokenize(text, external_wordlist=False):
     else:
         if jieba_tokenizer is None:
             jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+        # Tokenize the Simplified Chinese version of the text, but return
+        # those spans from the original text, even if it's in Traditional
+        # Chinese
+        tokens = []
+        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+            tokens.append(text[start:end])
+        return tokens
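
The key to this change is that jieba's Tokenizer.tokenize() yields (word, start, end) spans rather than plain strings, and because simplify_chinese maps characters one-for-one, a span computed on the Simplified text picks out exactly the corresponding characters of the original. A self-contained sketch of the same idea with a stock jieba Tokenizer (the helper name is made up here, and jieba's default dictionary stands in for wordfreq's DICT_FILENAME):

    import jieba

    def tokenize_preserving_original(text, simplified):
        # 'simplified' must be a character-for-character Simplified version
        # of 'text', as simplify_chinese produces; this helper is illustrative
        # and not part of wordfreq.
        tokenizer = jieba.Tokenizer()
        # Tokenizer.tokenize() yields (word, start, end) tuples; slicing the
        # original text with those offsets keeps Traditional characters intact.
        return [text[start:end]
                for _word, start, end in tokenizer.tokenize(simplified, HMM=False)]

    trad = '我們來試試看'   # Traditional input (illustrative)
    simp = '我们来试试看'   # Its Simplified form, same length
    print(tokenize_preserving_original(trad, simp))   # tokens are still Traditional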