From 099d90b700fd50e586019d07f53c2f530d3d080b Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 8 Sep 2015 12:59:03 -0400 Subject: [PATCH] Avoid Chinese tokenizer when building Former-commit-id: 77a9b5c55b89bfcf59555523c7a59fb2b249627b --- wordfreq_builder/wordfreq_builder/tokenizers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 1a75626..7d18026 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -32,6 +32,12 @@ def cld2_surface_tokenizer(text): text = TWITTER_HANDLE_RE.sub('', text) text = TCO_RE.sub('', text) lang = cld2_detect_language(text) + + # Don't allow tokenization in Chinese when language-detecting, because + # the Chinese tokenizer may not be built yet + if lang == 'zh': + lang = 'en' + tokens = tokenize(text, lang) return lang, tokens