changed tokenize_file: cld2 returns 'un' instead of None if it cannot recognize the language

This commit is contained in:
Joshua Chin 2015-06-17 14:19:28 -04:00
parent b24f31d30a
commit 8277de2c7f

View File

@@ -66,7 +66,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
for line in in_file: for line in in_file:
text = line_reader(line) text = line_reader(line)
tokens, language = tokenizer(text) tokens, language = tokenizer(text)
if language is not None: if language == 'un':
tokenized = '\n'.join(tokens) tokenized = '\n'.join(tokens)
out_filename = '%s.%s.txt' % (out_prefix, language) out_filename = '%s.%s.txt' % (out_prefix, language)
if out_filename in out_files: if out_filename in out_files: