mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
3fa14ded28
commit
099d90b700
@ -32,6 +32,12 @@ def cld2_surface_tokenizer(text):
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
text = TCO_RE.sub('', text)
|
||||
lang = cld2_detect_language(text)
|
||||
|
||||
# Don't allow tokenization in Chinese when language-detecting, because
|
||||
# the Chinese tokenizer may not be built yet
|
||||
if lang == 'zh':
|
||||
lang = 'en'
|
||||
|
||||
tokens = tokenize(text, lang)
|
||||
return lang, tokens
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user