Actually, still delay loading the Jieba tokenizer until it is first used

Former-commit-id: 48734d1a60
2024-12-23 09:21:37 +00:00 · 2015-09-22 16:54:39 -04:00 · 2015-09-22 16:54:39 -04:00 · b4628abb38
commit b4628abb38
parent 13642d6a4d
1 changed files with 5 additions and 3 deletions
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -6,7 +6,7 @@ import gzip
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
-JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME)
+jieba_tokenizer = None


 def simplify_chinese(text):
@@ -14,5 +14,7 @@ def simplify_chinese(text):


 def jieba_tokenize(text):
-    return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False)
-
+    global jieba_tokenizer
+    if jieba_tokenizer is None:
+        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)