# wordfreq/wordfreq/chinese.py


from __future__ import annotations

import gzip

import jieba
import msgpack

from .util import data_path

DICT_FILENAME = data_path("jieba_zh.txt")
ORIG_DICT_FILENAME = data_path("jieba_zh_orig.txt")
SIMP_MAP_FILENAME = data_path("_chinese_mapping.msgpack.gz")
try:
    SIMPLIFIED_MAP = msgpack.load(
        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
    )
except TypeError:
    # work around incompatibility between pure-Python msgpack and C msgpack
    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)

jieba_tokenizer: jieba.Tokenizer | None = None
jieba_orig_tokenizer: jieba.Tokenizer | None = None

def simplify_chinese(text: str) -> str:
    """
    Convert Chinese text character-by-character to Simplified Chinese, for the
    purpose of looking up word frequencies.

    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
    will sometimes produce nonsense words by simplifying characters that would
    not be simplified in context, or by simplifying words that would only be
    used in a Traditional Chinese locale. But the resulting text is still a
    reasonable key for looking up word frequencies.
    """
    return text.translate(SIMPLIFIED_MAP).casefold()
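
# A minimal usage sketch (illustrative, not part of the module): assuming the
# bundled character mapping covers standard Traditional-to-Simplified pairs
# such as 漢→汉 and 語→语,
#
#     simplify_chinese("漢語")   # -> "汉语"
#
# The result is meant only as a lookup key into the Simplified wordlist, not
# as a faithful Traditional-to-Simplified conversion.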


def jieba_tokenize(text: str, external_wordlist: bool = False) -> list[str]:
    """
    Tokenize the given text into tokens whose word frequencies can probably
    be looked up. This uses Jieba, a word-frequency-based tokenizer.

    If `external_wordlist` is False, we tell Jieba to default to using
    wordfreq's own Chinese wordlist, and not to infer unknown words using a
    hidden Markov model. This ensures that the multi-character tokens that it
    outputs will be ones whose word frequencies we can look up.

    If `external_wordlist` is True, this will use the largest version of
    Jieba's original dictionary, with HMM enabled, so its results will be
    independent of the data in wordfreq. These results will be better
    optimized for purposes that aren't looking up word frequencies, such as
    general-purpose tokenization, or collecting word frequencies in the first
    place.
    """
    global jieba_tokenizer, jieba_orig_tokenizer
    if external_wordlist:
        if jieba_orig_tokenizer is None:
            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
        return jieba_orig_tokenizer.lcut(text)
    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)

        # Tokenize the Simplified Chinese version of the text, but return
        # those spans from the original text, even if it's in Traditional
        # Chinese
        tokens = []
        for _token, start, end in jieba_tokenizer.tokenize(
            simplify_chinese(text), HMM=False
        ):
            tokens.append(text[start:end])
        return tokens
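

if __name__ == "__main__":
    # Illustrative sketch only, not part of wordfreq's public behavior: the
    # exact tokens depend on the bundled dictionaries, so the results noted in
    # the comments below are what we'd typically expect, not a guarantee.
    sample = "漢語的詞頻"  # Traditional Chinese input chosen for illustration

    # Character-by-character simplification, roughly "汉语的词频"
    print(simplify_chinese(sample))

    # Tokens are spans of the original (Traditional) text, found by running
    # Jieba over the simplified version with wordfreq's own dictionary
    print(jieba_tokenize(sample))

    # Tokens from Jieba's original dictionary, with HMM enabled
    print(jieba_tokenize(sample, external_wordlist=True))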