Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Traditional Chinese should be preserved through tokenization
This commit is contained in:
parent 45064a292f
commit 8e3dff3c1c
@@ -55,10 +55,19 @@ def test_tokens():
         ]
     )
 
-    # You match the same tokens if you look it up in Traditional Chinese.
-    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
     # Check that Traditional Chinese works at all
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+
+    # You get the same token lengths if you look it up in Traditional Chinese,
+    # but the words are different
+    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
+    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
+    eq_(''.join(simp_tokens), fact_simplified)
+    eq_(''.join(trad_tokens), fact_traditional)
+    simp_lengths = [len(token) for token in simp_tokens]
+    trad_lengths = [len(token) for token in trad_tokens]
+    eq_(simp_lengths, trad_lengths)
+
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
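For context, the new assertions boil down to the following standalone check. This is an illustrative sketch, not part of the commit: it assumes a wordfreq install that includes this change, uses made-up sample sentences in place of the test's actual fact_simplified / fact_traditional values (which the diff does not show), and uses plain asserts instead of nose's eq_ / assert_greater.

from wordfreq import tokenize, word_frequency

# Hypothetical stand-ins for the test's fact_simplified / fact_traditional
fact_simplified = '他是历史上第六位担任美国副总统的'
fact_traditional = '他是歷史上第六位擔任美國副總統的'

simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)

# Tokenization no longer rewrites the text into Simplified Chinese:
# joining the tokens reproduces each original string, and the token
# boundaries (hence lengths) are the same in both scripts.
assert ''.join(simp_tokens) == fact_simplified
assert ''.join(trad_tokens) == fact_traditional
assert [len(t) for t in simp_tokens] == [len(t) for t in trad_tokens]

# Traditional Chinese text still gets a nonzero frequency estimate.
assert word_frequency(fact_traditional, 'zh') > 0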
@@ -49,4 +49,11 @@ def jieba_tokenize(text, external_wordlist=False):
     else:
         if jieba_tokenizer is None:
             jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+
+        # Tokenize the Simplified Chinese version of the text, but return
+        # those spans from the original text, even if it's in Traditional
+        # Chinese
+        tokens = []
+        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+            tokens.append(text[start:end])
+        return tokens
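The core idea in jieba_tokenize is to segment a Simplified Chinese copy of the text with jieba, then slice the reported (start, end) spans out of the original string, so Traditional characters pass through untouched. This works because the simplification step maps characters one-to-one, keeping offsets aligned. Below is a minimal self-contained sketch of the same idea using stock jieba, with a tiny hand-made translation table standing in for wordfreq's simplify_chinese() and custom dictionary (neither of which appears in this diff).

import jieba

# Toy Traditional-to-Simplified map; wordfreq's simplify_chinese() covers far
# more characters, but the key property is the same: one character maps to
# one character, so string offsets stay aligned.
SIMPLIFY_MAP = str.maketrans('歷擔國總統', '历担国总统')

def tokenize_preserving_original(text):
    simplified = text.translate(SIMPLIFY_MAP)  # same length as `text`
    tokens = []
    # jieba.tokenize yields (word, start, end) offsets into `simplified`;
    # slicing `text` with those offsets keeps the original characters.
    for _word, start, end in jieba.tokenize(simplified, HMM=False):
        tokens.append(text[start:end])
    return tokens

trad = '他是歷史上第六位擔任美國副總統的'
tokens = tokenize_preserving_original(trad)
assert ''.join(tokens) == trad  # the Traditional text is preserved

The span slicing is what makes the round trip in the test above hold: whatever segmentation jieba produces on the simplified copy, the returned tokens always concatenate back to the original input.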