diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 25e6fe1..58df4a1 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -55,10 +55,19 @@ def test_tokens():
         ]
     )
 
-    # You match the same tokens if you look it up in Traditional Chinese.
-    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
+    # Check that Traditional Chinese works at all
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
 
+    # You get the same token lengths if you look it up in Traditional Chinese,
+    # but the words are different
+    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
+    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
+    eq_(''.join(simp_tokens), fact_simplified)
+    eq_(''.join(trad_tokens), fact_traditional)
+    simp_lengths = [len(token) for token in simp_tokens]
+    trad_lengths = [len(token) for token in trad_tokens]
+    eq_(simp_lengths, trad_lengths)
+
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index c57e937..9f7b95a 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -49,4 +49,11 @@ def jieba_tokenize(text, external_wordlist=False):
     else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+
+        # Tokenize the Simplified Chinese version of the text, but return
+        # those spans from the original text, even if it's in Traditional
+        # Chinese
+        tokens = []
+        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+            tokens.append(text[start:end])
+        return tokens
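
For context, the span-mapping trick in the `jieba_tokenize` change can be sketched on its own. The snippet below is a minimal illustration, not part of the diff: `tokenize_preserving_original` and the tiny `SIMPLIFY` table are hypothetical stand-ins for wordfreq's `jieba_tokenize` and `simplify_chinese`. It relies on the same property the patch does: simplification maps one character to one character, so token spans found in the Simplified text line up exactly with the original Traditional text.

```python
import jieba

# Hypothetical, partial stand-in for wordfreq's simplify_chinese, which maps
# each Traditional character to a Simplified one, one character to one
# character, so string length and offsets are preserved.
SIMPLIFY = str.maketrans('漢語個們這來', '汉语个们这来')


def tokenize_preserving_original(text):
    simplified = text.translate(SIMPLIFY)
    # jieba.tokenize yields (word, start, end) spans over the string it was
    # given; because simplification is one-to-one, the same spans index the
    # original (possibly Traditional) text.
    return [text[start:end]
            for _word, start, end in jieba.tokenize(simplified, HMM=False)]


print(tokenize_preserving_original('這個漢語'))  # tokens are Traditional spans
```

This is also what the updated test is checking: Simplified and Traditional inputs produce different token strings, but the same token lengths, and concatenating the tokens reproduces the original input exactly.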