wordfreq/tests/test_chinese.py

import pytest
from wordfreq import tokenize, word_frequency, zipf_frequency


def test_tokens():
    # Exercise Chinese tokenization on text about an American vice-president,
    # chosen because it contains unusual combinations of syllables.
    #
    # (The subject was the Chinese Wikipedia's featured article of the day
    # when this test was first written.)

    # Garret Hobart, or "jiā lè tè huò bā tè".
    name = "加勒特·霍巴特"

    # "He was the sixth American vice president to die in office."
    sentence_simp = "他是历史上第六位在任期内去世的美国副总统。"
    sentence_trad = "他是歷史上第六位在任期內去世的美國副總統。"

    # Both wordlists are expected to agree on the start and end of the
    # sentence; only the "sixth" + counter part in the middle differs.
    #
    # he / is / history / in
    opening = ["他", "是", "历史", "上"]
    # during / term of office / in / die / of / U.S. / deputy / president
    closing = ["在", "任期", "内", "去世", "的", "美国", "副", "总统"]

    # With the default wordlist the name falls apart into five pieces; the
    # only multi-character piece is the one that means 'Bart'. The dot
    # separating the two halves of the name does not show up as a token.
    default_name_tokens = tokenize(name, "zh")
    assert default_name_tokens == ["加", "勒", "特", "霍", "巴特"]

    # Default wordlist: "#6" and the counter for people are separate tokens.
    default_sentence_tokens = tokenize(sentence_simp, "zh")
    assert default_sentence_tokens == opening + ["第六", "位"] + closing

    # Jieba's original tokenizer seems to know a lot of names: each half of
    # the name stays whole.
    jieba_name_tokens = tokenize(name, "zh", external_wordlist=True)
    assert jieba_name_tokens == ["加勒特", "霍巴特"]

    # With Jieba's own wordlist the sentence comes out almost the same; the
    # difference is that "sixth person" is kept together as a single token.
    jieba_sentence_tokens = tokenize(sentence_simp, "zh", external_wordlist=True)
    assert jieba_sentence_tokens == opening + ["第六位"] + closing

    # Check that Traditional Chinese works at all.
    assert word_frequency(sentence_trad, "zh") > 0

    # Tokenizing with punctuation included must reproduce each sentence
    # exactly when the tokens are concatenated, and the Simplified and
    # Traditional versions must split at the same positions even though the
    # words themselves are different.
    length_profiles = []
    for sentence in (sentence_simp, sentence_trad):
        pieces = tokenize(sentence, "zh", include_punctuation=True)
        assert "".join(pieces) == sentence
        length_profiles.append(list(map(len, pieces)))

    simp_profile, trad_profile = length_profiles
    assert simp_profile == trad_profile


def test_combination():
    # The frequency of "Thanks" repeated twice should be a fixed fraction
    # (one twentieth, within 1%) of the frequency of a single "Thanks".
    # NOTE(review): the factor of 20 presumably comes from how wordfreq
    # combines the frequencies of the two inferred tokens -- confirm against
    # the library's combination rule.
    single_freq = word_frequency("谢谢", "zh")  # "Thanks"
    doubled_freq = word_frequency("谢谢谢谢", "zh")
    expected = pytest.approx(single_freq / 20, rel=0.01)
    assert doubled_freq == expected


def test_alternate_codes():
    # Chinese tokenization should also kick in for language codes that are
    # not literally 'zh'. Every code below must split the text the same way.
    text = "谢谢谢谢"
    expected = ["谢谢", "谢谢"]

    codes = (
        # Code with a region attached
        "zh-CN",
        # Over-long codes for Chinese
        "chi",
        "zho",
        # Separate codes for Mandarin and Cantonese
        "cmn",
        "yue",
    )
    for code in codes:
        assert tokenize(text, code) == expected


def test_unreasonably_long():
    # Regression test: a very long input used to crash earlier versions of
    # wordfreq because an exponentiation overflowed. The sequence of
    # operations has since been changed so that the computation underflows
    # instead, which is why both lookups are expected to return exactly 0.0.
    long_text = "l" * 800
    for lookup in (word_frequency, zipf_frequency):
        assert lookup(long_text, "zh") == 0.0


def test_hyphens():
    # A run of hyphens is an edge case of Chinese tokenization whose result
    # changed sometime around jieba 0.42.
    hyphens = "--------"

    # With the default wordlist, every hyphen is its own token.
    default_tokens = tokenize(hyphens, "zh", include_punctuation=True)
    assert default_tokens == list(hyphens)

    # With the external wordlist, the whole run stays together as one token.
    external_tokens = tokenize(
        hyphens, "zh", include_punctuation=True, external_wordlist=True
    )
    assert external_tokens == [hyphens]
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`import pytest`
v3.1: support py3.12, update formatting, replace pkg_resources with locate 2023-11-21 23:07:04 +00:00			`from wordfreq import tokenize, word_frequency, zipf_frequency`
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00

			`def test_tokens():`
			`# Let's test on some Chinese text that has unusual combinations of`
			`# syllables, because it is about an American vice-president.`
			`#`
			`# (He was the Chinese Wikipedia's featured article of the day when I`
			`# wrote this test.)`

estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`hobart = "加勒特·霍巴特" # Garret Hobart, or "jiā lè tè huò bā tè".`
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00
			`# He was the sixth American vice president to die in office.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`fact_simplified = "他是历史上第六位在任期内去世的美国副总统。"`
			`fact_traditional = "他是歷史上第六位在任期內去世的美國副總統。"`
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00
			`# His name breaks into five pieces, with the only piece staying together`
			`# being the one that means 'Bart'. The dot is not included as a token.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize(hobart, "zh") == ["加", "勒", "特", "霍", "巴特"]`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize(fact_simplified, "zh") == [`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# he / is / history / in / #6 / counter for people`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"他",`
			`"是",`
			`"历史",`
			`"上",`
			`"第六",`
			`"位",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# during / term of office / in / die`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"在",`
			`"任期",`
			`"内",`
			`"去世",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# of / U.S. / deputy / president`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"的",`
			`"美国",`
			`"副",`
			`"总统",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`]`
add `external_wordlist` option to tokenize Former-commit-id: 669bd16c13676cccd881920b22e8cf53d9803022 2015-09-10 22:09:41 +00:00
test_chinese: fix typo in comment Former-commit-id: 2a84a926f57f1ec7e1fddd72eb94f890f50e2bee 2015-09-24 17:41:11 +00:00			`# Jieba's original tokenizer knows a lot of names, it seems.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize(hobart, "zh", external_wordlist=True) == ["加勒特", "霍巴特"]`
add `external_wordlist` option to tokenize Former-commit-id: 669bd16c13676cccd881920b22e8cf53d9803022 2015-09-10 22:09:41 +00:00
			`# We get almost the same tokens from the sentence using Jieba's own`
			`# wordlist, but it tokenizes "in history" as two words and`
			`# "sixth person" as one.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize(fact_simplified, "zh", external_wordlist=True) == [`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# he / is / history / in / sixth person`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"他",`
			`"是",`
			`"历史",`
			`"上",`
			`"第六位",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# during / term of office / in / die`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"在",`
			`"任期",`
			`"内",`
			`"去世",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`# of / U.S. / deputy / president`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`"的",`
			`"美国",`
			`"副",`
			`"总统",`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`]`
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00
Traditional Chinese should be preserved through tokenization 2018-03-08 23:08:55 +00:00			`# Check that Traditional Chinese works at all`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert word_frequency(fact_traditional, "zh") > 0`
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00
Traditional Chinese should be preserved through tokenization 2018-03-08 23:08:55 +00:00			`# You get the same token lengths if you look it up in Traditional Chinese,`
			`# but the words are different`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`simp_tokens = tokenize(fact_simplified, "zh", include_punctuation=True)`
			`trad_tokens = tokenize(fact_traditional, "zh", include_punctuation=True)`
			`assert "".join(simp_tokens) == fact_simplified`
			`assert "".join(trad_tokens) == fact_traditional`
Traditional Chinese should be preserved through tokenization 2018-03-08 23:08:55 +00:00			`simp_lengths = [len(token) for token in simp_tokens]`
			`trad_lengths = [len(token) for token in trad_tokens]`
port test.py and test_chinese.py to pytest 2018-06-01 20:33:06 +00:00			`assert simp_lengths == trad_lengths`
Traditional Chinese should be preserved through tokenization 2018-03-08 23:08:55 +00:00
tokenize Chinese using jieba and our own frequencies Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628 2015-09-05 07:16:56 +00:00
			`def test_combination():`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`xiexie_freq = word_frequency("谢谢", "zh") # "Thanks"`
			`assert word_frequency("谢谢谢谢", "zh") == pytest.approx(xiexie_freq / 20, rel=0.01)`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00

			`def test_alternate_codes():`
			`# Tokenization of Chinese works when you use other language codes`
			`# that are not equal to 'zh'.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`tokens = ["谢谢", "谢谢"]`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00
			`# Code with a region attached`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("谢谢谢谢", "zh-CN") == tokens`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00
			`# Over-long codes for Chinese`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("谢谢谢谢", "chi") == tokens`
			`assert tokenize("谢谢谢谢", "zho") == tokens`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00
			`# Separate codes for Mandarin and Cantonese`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("谢谢谢谢", "cmn") == tokens`
			`assert tokenize("谢谢谢谢", "yue") == tokens`
change math for INFERRED_SPACE_FACTOR to not overflow 2021-02-18 19:44:39 +00:00

			`def test_unreasonably_long():`
specifically test that the long sequence underflows to 0 2021-02-18 20:09:31 +00:00			`# This crashed earlier versions of wordfreq due to an overflow in`
			`# exponentiation. We've now changed the sequence of operations so it`
			`# will underflow instead.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`lots_of_ls = "l" * 800`
			`assert word_frequency(lots_of_ls, "zh") == 0.0`
			`assert zipf_frequency(lots_of_ls, "zh") == 0.0`
change math for INFERRED_SPACE_FACTOR to not overflow 2021-02-18 19:44:39 +00:00
Merge remote-tracking branch 'origin/apostrophe-consistency' 2021-09-02 18:13:53 +00:00
update dependencies and test for consistent results 2020-09-08 20:03:33 +00:00			`def test_hyphens():`
			`# An edge case of Chinese tokenization that changed sometime around`
			`# jieba 0.42.`

estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`tok = tokenize("--------", "zh", include_punctuation=True)`
			`assert tok == ["-"] * 8`
update dependencies and test for consistent results 2020-09-08 20:03:33 +00:00
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`tok = tokenize("--------", "zh", include_punctuation=True, external_wordlist=True)`
			`assert tok == ["--------"]`