Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00.
actually, still delay loading the Jieba tokenizer
This commit is contained in:
parent
7a3ea2bf79
commit
48734d1a60
# Paths to the data files bundled with wordfreq: Jieba's Chinese
# dictionary and the traditional->simplified character mapping table.
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')

# The simplification map is small, so it is loaded eagerly at import time.
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')

# Constructing the Jieba tokenizer is expensive, so it is NOT built at
# import time. This sentinel is filled in lazily by jieba_tokenize() on
# the first call that actually needs it.
jieba_tokenizer = None
||||||
|
|
||||||
|
|
||||||
def simplify_chinese(text):
|
def simplify_chinese(text):
|
||||||
@ -14,5 +14,7 @@ def simplify_chinese(text):
|
|||||||
|
|
||||||
|
|
||||||
def jieba_tokenize(text):
    """
    Tokenize Chinese text using Jieba with wordfreq's own dictionary.

    The input is first converted to simplified characters, then cut into
    tokens with HMM-based discovery of out-of-vocabulary words disabled,
    so only dictionary entries are produced.

    The Jieba tokenizer is slow to construct, so it is created lazily
    here on first use instead of at module import time.
    """
    global jieba_tokenizer
    if jieba_tokenizer is None:
        # First call: build the tokenizer and cache it at module level.
        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
|
||||||
|
Loading…
Reference in New Issue
Block a user