remove unnecessary delayed loads in wordfreq.chinese

This commit is contained in:
Rob Speer 2015-09-22 16:42:13 -04:00
parent 6cf4210187
commit 4a87890afd

View File

@@ -3,22 +3,16 @@ import jieba
import msgpack
import gzip
# Legacy lowercase slots from the delayed-loading approach. They are kept so
# that any code still referring to these module attributes keeps working.
jieba_tokenizer = None
simplified_map = None

# Locations of the data files that are packaged with wordfreq.
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')

# Load the character translation table once, at import time. The `with`
# block makes sure the gzip file handle is closed as soon as msgpack has
# finished reading from it, instead of leaving it open until garbage
# collection.
with gzip.open(SIMP_MAP_FILENAME) as _simp_map_file:
    SIMPLIFIED_MAP = msgpack.load(_simp_map_file, encoding='utf-8')

# A jieba tokenizer that uses wordfreq's own dictionary file.
JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME)
def simplify_chinese(text):
    """
    Return `text` translated through the module-level `SIMPLIFIED_MAP`
    table and then case-folded.

    `SIMPLIFIED_MAP` is the table loaded from SIMP_MAP_FILENAME when this
    module is imported, so no loading happens here. Judging by the names,
    the table presumably maps Traditional Chinese characters to their
    Simplified forms (confirm against the data file).
    """
    return text.translate(SIMPLIFIED_MAP).casefold()
def jieba_tokenize(text):
    """
    Split `text` into a list of tokens using the module-level
    `JIEBA_TOKENIZER`.

    The text is first passed through `simplify_chinese`, and the result is
    cut with `HMM=False`. The tokenizer is built once at import time from
    DICT_FILENAME, so no tokenizer is constructed here.
    """
    return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False)