Avoid Chinese tokenizer when building

This commit is contained in:
Rob Speer 2015-09-08 12:59:03 -04:00
parent 9071defb33
commit 77a9b5c55b

View File

@@ -32,6 +32,12 @@ def cld2_surface_tokenizer(text):
text = TWITTER_HANDLE_RE.sub('', text) text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text) text = TCO_RE.sub('', text)
lang = cld2_detect_language(text) lang = cld2_detect_language(text)
# Don't allow tokenization in Chinese when language-detecting, because
# the Chinese tokenizer may not be built yet
if lang == 'zh':
lang = 'en'
tokens = tokenize(text, lang) tokens = tokenize(text, lang)
return lang, tokens return lang, tokens