# wordfreq/wordfreq/chinese.py


from __future__ import annotations

import gzip

import jieba
import msgpack

from .util import data_path

DICT_FILENAME = data_path("jieba_zh.txt")
ORIG_DICT_FILENAME = data_path("jieba_zh_orig.txt")
SIMP_MAP_FILENAME = data_path("_chinese_mapping.msgpack.gz")
try:
    SIMPLIFIED_MAP = msgpack.load(
        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
    )
except TypeError:
    # work around incompatibility between pure-Python msgpack and C msgpack
    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)

jieba_tokenizer: jieba.Tokenizer | None = None
jieba_orig_tokenizer: jieba.Tokenizer | None = None

def simplify_chinese(text: str) -> str:
    """
    Convert Chinese text character-by-character to Simplified Chinese, for the
    purpose of looking up word frequencies.

    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
    will sometimes produce nonsense words by simplifying characters that would
    not be simplified in context, or by simplifying words that would only be
    used in a Traditional Chinese locale. But the resulting text is still a
    reasonable key for looking up word frequencies.
    """
    return text.translate(SIMPLIFIED_MAP).casefold()
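
# A minimal usage sketch (illustrative, not part of the module): assuming the
# bundled character mapping covers standard Traditional-to-Simplified pairs
# such as 漢→汉 and 語→语,
#
#     simplify_chinese("漢語")   # -> "汉语"
#
# The result is meant only as a lookup key into the Simplified wordlist, not
# as a faithful Traditional-to-Simplified conversion.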


def jieba_tokenize(text: str, external_wordlist: bool = False) -> list[str]:
    """
    Tokenize the given text into tokens whose word frequencies can probably
    be looked up. This uses Jieba, a word-frequency-based tokenizer.

    If `external_wordlist` is False, we tell Jieba to default to using
    wordfreq's own Chinese wordlist, and not to infer unknown words using a
    hidden Markov model. This ensures that the multi-character tokens that it
    outputs will be ones whose word frequencies we can look up.

    If `external_wordlist` is True, this will use the largest version of
    Jieba's original dictionary, with HMM enabled, so its results will be
    independent of the data in wordfreq. These results will be better
    optimized for purposes that aren't looking up word frequencies, such as
    general-purpose tokenization, or collecting word frequencies in the first
    place.
    """
    global jieba_tokenizer, jieba_orig_tokenizer
    if external_wordlist:
        if jieba_orig_tokenizer is None:
            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
        return jieba_orig_tokenizer.lcut(text)
    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)

        # Tokenize the Simplified Chinese version of the text, but return
        # those spans from the original text, even if it's in Traditional
        # Chinese
        tokens = []
        for _token, start, end in jieba_tokenizer.tokenize(
            simplify_chinese(text), HMM=False
        ):
            tokens.append(text[start:end])
        return tokens
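

if __name__ == "__main__":
    # Illustrative sketch only, not part of wordfreq's public behavior: the
    # exact tokens depend on the bundled dictionaries, so the results noted in
    # the comments below are what we'd typically expect, not a guarantee.
    sample = "漢語的詞頻"  # Traditional Chinese input chosen for illustration

    # Character-by-character simplification, roughly "汉语的词频"
    print(simplify_chinese(sample))

    # Tokens are spans of the original (Traditional) text, found by running
    # Jieba over the simplified version with wordfreq's own dictionary
    print(jieba_tokenize(sample))

    # Tokens from Jieba's original dictionary, with HMM enabled
    print(jieba_tokenize(sample, external_wordlist=True))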