mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
avoid log spam: only warn about an unsupported language once
This commit is contained in:
parent
c5f64a5de8
commit
b162de353d
@@ -11,6 +11,7 @@ _mecab_tokenize = None
|
|||||||
_jieba_tokenize = None
|
_jieba_tokenize = None
|
||||||
_simplify_chinese = None
|
_simplify_chinese = None
|
||||||
|
|
||||||
|
_WARNED_LANGUAGES = set()
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -211,11 +212,13 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
|||||||
# let's complain a bit if we ended up here because we don't have an
|
# let's complain a bit if we ended up here because we don't have an
|
||||||
# appropriate tokenizer.
|
# appropriate tokenizer.
|
||||||
if info['tokenizer'] != 'regex':
|
if info['tokenizer'] != 'regex':
|
||||||
logger.warning(
|
if lang not in _WARNED_LANGUAGES:
|
||||||
"The language '{}' is in the '{}' script, which we don't "
|
logger.warning(
|
||||||
"have a tokenizer for. The results will be bad."
|
"The language '{}' is in the '{}' script, which we don't "
|
||||||
.format(lang, info['script'])
|
"have a tokenizer for. The results will be bad."
|
||||||
)
|
.format(lang, info['script'])
|
||||||
|
)
|
||||||
|
_WARNED_LANGUAGES.add(lang)
|
||||||
tokens = simple_tokenize(text, include_punctuation=include_punctuation)
|
tokens = simple_tokenize(text, include_punctuation=include_punctuation)
|
||||||
|
|
||||||
return tokens
|
return tokens
|
||||||
|
Loading…
Reference in New Issue
Block a user