Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00).
Commit 77a9b5c55b — "Avoid Chinese tokenizer when building"
Parent commit: 9071defb33
@@ -32,6 +32,12 @@ def cld2_surface_tokenizer(text):
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
     lang = cld2_detect_language(text)
+
+    # Don't allow tokenization in Chinese when language-detecting, because
+    # the Chinese tokenizer may not be built yet
+    if lang == 'zh':
+        lang = 'en'
+
     tokens = tokenize(text, lang)
     return lang, tokens
 
|
Loading…
Reference in New Issue
Block a user