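"""
Tests of wordfreq's behavior on Chinese: tokenization, Traditional vs.
Simplified text, and the frequencies of multi-word phrases.
"""
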
from wordfreq import tokenize, word_frequency, zipf_frequency
import pytest


def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)
    hobart = "加勒特·霍巴特"  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = "他是历史上第六位在任期内去世的美国副总统。"
    fact_traditional = "他是歷史上第六位在任期內去世的美國副總統。"

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, "zh") == ["加", "勒", "特", "霍", "巴特"]

    assert tokenize(fact_simplified, "zh") == [
        # he / is / history / in / #6 / counter for people
        "他",
        "是",
        "历史",
        "上",
        "第六",
        "位",
        # during / term of office / in / die
        "在",
        "任期",
        "内",
        "去世",
        # of / U.S. / deputy / president
        "的",
        "美国",
        "副",
        "总统",
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
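    # (external_wordlist=True tells tokenize to use jieba's own dictionary
    # rather than wordfreq's Chinese wordlist.)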
    assert tokenize(hobart, "zh", external_wordlist=True) == ["加勒特", "霍巴特"]

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "sixth person" as one word instead of two.
    assert tokenize(fact_simplified, "zh", external_wordlist=True) == [
        # he / is / history / in / sixth person
        "他",
        "是",
        "历史",
        "上",
        "第六位",
        # during / term of office / in / die
        "在",
        "任期",
        "内",
        "去世",
        # of / U.S. / deputy / president
        "的",
        "美国",
        "副",
        "总统",
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, "zh") > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
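    # (presumably because Traditional characters map one-for-one onto
    # Simplified ones, so the token boundaries fall in the same places).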
    simp_tokens = tokenize(fact_simplified, "zh", include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, "zh", include_punctuation=True)
    assert "".join(simp_tokens) == fact_simplified
    assert "".join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths


def test_combination():
    xiexie_freq = word_frequency("谢谢", "zh")  # "Thanks"
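    # Two tokens of "谢谢" should combine to roughly 1/20 of the single-word
    # frequency: as I understand it, wordfreq combines equal token frequencies
    # into half their harmonic mean (f / 2), then divides by 10 for the word
    # boundary it had to infer in unspaced Chinese text.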
    assert word_frequency("谢谢谢谢", "zh") == pytest.approx(xiexie_freq / 20, rel=0.01)


def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
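    # wordfreq does language-code matching (via the langcodes package, as far
    # as I know), so all of these should end up on the same Chinese wordlist.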
    tokens = ["谢谢", "谢谢"]

    # Code with a region attached
    assert tokenize("谢谢谢谢", "zh-CN") == tokens

    # Over-long codes for Chinese
    assert tokenize("谢谢谢谢", "chi") == tokens
    assert tokenize("谢谢谢谢", "zho") == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize("谢谢谢谢", "cmn") == tokens
    assert tokenize("谢谢谢谢", "yue") == tokens


def test_unreasonably_long():
    # This crashed earlier versions of wordfreq due to an overflow in
    # exponentiation. We've now changed the sequence of operations so it
    # will underflow instead.
    lots_of_ls = "l" * 800
    assert word_frequency(lots_of_ls, "zh") == 0.0
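    # zipf_frequency is on a log scale but clamps at a minimum of 0, so the
    # underflowed frequency should come back as 0.0 here as well.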
    assert zipf_frequency(lots_of_ls, "zh") == 0.0


def test_hyphens():
    # An edge case of Chinese tokenization that changed sometime around
    # jieba 0.42.
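    # With wordfreq's wordlist, each hyphen comes out as its own token; with
    # jieba's own wordlist, the whole run stays together.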
    tok = tokenize("--------", "zh", include_punctuation=True)
    assert tok == ["-"] * 8

    tok = tokenize("--------", "zh", include_punctuation=True, external_wordlist=True)
    assert tok == ["--------"]