mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-26 02:28:50 +00:00
Changed tokenize_file: cld2 returns 'un' instead of None when it cannot recognize the language
This commit is contained in:
parent
b24f31d30a
commit
8277de2c7f
@ -66,7 +66,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
|
|||||||
for line in in_file:
|
for line in in_file:
|
||||||
text = line_reader(line)
|
text = line_reader(line)
|
||||||
tokens, language = tokenizer(text)
|
tokens, language = tokenizer(text)
|
||||||
if language is not None:
|
if language == 'un':
|
||||||
tokenized = '\n'.join(tokens)
|
tokenized = '\n'.join(tokens)
|
||||||
out_filename = '%s.%s.txt' % (out_prefix, language)
|
out_filename = '%s.%s.txt' % (out_prefix, language)
|
||||||
if out_filename in out_files:
|
if out_filename in out_files:
|
||||||
|
Loading…
Reference in New Issue
Block a user