Avoid Chinese tokenizer when building

Former-commit-id: 77a9b5c55b
This commit is contained in:
Robyn Speer 2015-09-08 12:59:03 -04:00
parent 3fa14ded28
commit 099d90b700

View File

@ -32,6 +32,12 @@ def cld2_surface_tokenizer(text):
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
lang = cld2_detect_language(text)
# Don't allow tokenization in Chinese when language-detecting, because
# the Chinese tokenizer may not be built yet
if lang == 'zh':
lang = 'en'
tokens = tokenize(text, lang)
return lang, tokens