mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Actually, still delay loading the Jieba tokenizer until it is first used
Former-commit-id: 48734d1a60
This commit is contained in:
parent
13642d6a4d
commit
b4628abb38
@ -6,7 +6,7 @@ import gzip
|
||||
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
|
||||
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
|
||||
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
|
||||
JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME)
|
||||
jieba_tokenizer = None
|
||||
|
||||
|
||||
def simplify_chinese(text):
|
||||
@ -14,5 +14,7 @@ def simplify_chinese(text):
|
||||
|
||||
|
||||
def jieba_tokenize(text):
|
||||
return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False)
|
||||
|
||||
global jieba_tokenizer
|
||||
if jieba_tokenizer is None:
|
||||
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
|
||||
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
|
||||
|
Loading…
Reference in New Issue
Block a user