Define globals in relevant places

Former-commit-id: a6b6aa07e7
Robyn Speer 2015-10-19 18:15:54 -04:00
parent 91a81c1bde
commit 5fedd71a66


@@ -84,6 +84,7 @@ def turkish_tokenize(text, include_punctuation=False):
     return [token.strip("'").casefold() for token in token_expr.findall(text)]


+mecab_tokenize = None
 def japanese_tokenize(text, include_punctuation=False):
     global mecab_tokenize
     if mecab_tokenize is None:
@@ -93,6 +94,7 @@ def japanese_tokenize(text, include_punctuation=False):
     return [token.casefold() for token in tokens if token_expr.match(token)]


+jieba_tokenize = None
 def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     global jieba_tokenize
     if jieba_tokenize is None:
@@ -114,8 +116,6 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)


-mecab_tokenize = None
-jieba_tokenize = None
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
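The commit moves each lazily-initialized tokenizer global next to the function that populates it, instead of declaring both at the bottom of the module far from their use sites. A minimal sketch of the pattern, for context; the names heavy_tokenize and tokenize_with_heavy_dep are illustrative stand-ins, not wordfreq's actual API, and the regex body stands in for a slow import such as MeCab or jieba:

# The global starts out as None; the first call to the function that
# needs it performs the expensive setup, and later calls reuse it.
heavy_tokenize = None


def tokenize_with_heavy_dep(text):
    global heavy_tokenize
    if heavy_tokenize is None:
        # Stand-in for an expensive import/initialization (e.g. loading
        # a MeCab or jieba tokenizer); this branch runs only once.
        import re
        heavy_tokenize = re.compile(r'\w+').findall
    return heavy_tokenize(text)


if __name__ == '__main__':
    print(tokenize_with_heavy_dep("define globals in relevant places"))

Defining the global immediately above the function that fills it in keeps the declaration and the `global` statement visible together, which is the readability point of the commit message.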