remove unnecessary delayed loads in wordfreq.chinese

This commit is contained in:
Rob Speer 2015-09-22 16:42:13 -04:00
parent 6cf4210187
commit 4a87890afd

View File

@@ -3,22 +3,16 @@ import jieba
import msgpack import msgpack
import gzip import gzip
jieba_tokenizer = None
simplified_map = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt') DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz') SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME)
def simplify_chinese(text): def simplify_chinese(text):
global simplified_map return text.translate(SIMPLIFIED_MAP).casefold()
if simplified_map is None:
simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
return text.translate(simplified_map).casefold()
def jieba_tokenize(text): def jieba_tokenize(text):
global jieba_tokenizer return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False)
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)