Actually, still delay loading the Jieba tokenizer

Former-commit-id: 48734d1a60
This commit is contained in:
Robyn Speer 2015-09-22 16:54:39 -04:00
parent 13642d6a4d
commit b4628abb38

View File

@ -6,7 +6,7 @@ import gzip
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME)
jieba_tokenizer = None
def simplify_chinese(text):
@ -14,5 +14,7 @@ def simplify_chinese(text):
def jieba_tokenize(text):
return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False)
global jieba_tokenizer
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)