tokenize_file should ignore lines with unknown languages

Joshua Chin 2015-06-18 10:18:57 -04:00
parent 2f4fe92c90
commit 34e9512517


@@ -67,8 +67,8 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
             text = line_reader(line)
-            tokens, language = tokenizer(text)
-            if language == 'un':
+            language, tokens = tokenizer(text)
+            if language != 'un':
                 tokenized = '\n'.join(tokens)
                 out_filename = '%s.%s.txt' % (out_prefix, language)
                 if out_filename in out_files:
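
For context, a minimal sketch of how the loop behaves after this change; it is not the project's actual implementation. It assumes the tokenizer returns a (language, tokens) pair and uses 'un' as the unknown-language marker as in the diff, while simplifying away the line_reader parameter and the module's real out_files bookkeeping.

    def tokenize_file(in_filename, out_prefix, tokenizer):
        """Write tokenized lines to per-language files, skipping unknown ('un') lines."""
        out_files = {}
        try:
            with open(in_filename, encoding='utf-8') as in_file:
                for line in in_file:
                    text = line.rstrip('\n')
                    language, tokens = tokenizer(text)
                    if language != 'un':  # ignore lines whose language is unknown
                        tokenized = '\n'.join(tokens)
                        out_filename = '%s.%s.txt' % (out_prefix, language)
                        if out_filename not in out_files:
                            out_files[out_filename] = open(out_filename, 'w', encoding='utf-8')
                        print(tokenized, file=out_files[out_filename])
        finally:
            for out_file in out_files.values():
                out_file.close()

    # Toy usage with a stand-in tokenizer (hypothetical, for illustration only):
    # ASCII-only lines are labeled 'en', everything else 'un' and therefore skipped.
    def toy_tokenizer(text):
        if text and text.isascii():
            return 'en', text.split()
        return 'un', []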