from wordfreq import tokenize, word_frequency, zipf_frequency
import pytest


def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']

    assert tokenize(fact_simplified, 'zh') == [
        # he / is / history / in / #6 / counter for people
        '他', '是', '历史', '上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, 'zh') > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == fact_simplified
    assert ''.join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths


def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
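

# A step-by-step sketch of where the factor of 20 in test_combination plausibly
# comes from. It recomputes the same expected value as the assertion above
# rather than documenting wordfreq's exact internals: combining the two
# equal-frequency tokens of '谢谢谢谢' with a half harmonic mean would give
# xiexie_freq / 2, and a further assumed factor-of-10 penalty for the word
# boundary that had to be inferred (Chinese text has no spaces) would give
# xiexie_freq / 20.
def test_combination_breakdown():
    xiexie_freq = word_frequency('谢谢', 'zh')

    # Half harmonic mean of two equal frequencies: 1 / (1/f + 1/f) == f / 2.
    half_harmonic_mean = 1.0 / (1.0 / xiexie_freq + 1.0 / xiexie_freq)

    # Assumed penalty of 10 for the single inferred word boundary; together
    # these reproduce the f / 20 expectation used in test_combination.
    expected = half_harmonic_mean / 10

    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(expected, rel=0.01)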


def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens


def test_unreasonably_long():
    # This crashed earlier versions of wordfreq due to an overflow in
    # exponentiation. We've now changed the sequence of operations so it
    # will underflow instead.
    lots_of_ls = 'l' * 800
    assert word_frequency(lots_of_ls, 'zh') == 0.
    assert zipf_frequency(lots_of_ls, 'zh') == 0.
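

# Background for the overflow/underflow comment in test_unreasonably_long, as a
# plain-float sketch. This demonstrates Python float behavior only, not
# wordfreq's exact internal formula (the connection to the reordered
# computation is an assumption): exponentiation toward huge magnitudes raises
# OverflowError, while exponentiation toward tiny magnitudes silently
# underflows to 0.0, which is the harmless result the asserts above expect for
# a nonsense 800-letter "word".
def test_underflow_is_the_safe_direction():
    # Overflowing float exponentiation raises instead of returning a value...
    with pytest.raises(OverflowError):
        _ = 10.0 ** 10000

    # ...while underflowing exponentiation quietly returns 0.0.
    assert 10.0 ** -10000 == 0.0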


def test_hyphens():
    # An edge case of Chinese tokenization that changed sometime around
    # jieba 0.42.

    tok = tokenize('--------', 'zh', include_punctuation=True)
    assert tok == ['-'] * 8

    tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
    assert tok == ['--------']