tokenize_file: don't join tokens if language is None

Joshua Chin 2015-06-17 14:18:18 -04:00
parent 7fc0ba9092
commit b5bc39c893


@@ -66,8 +66,8 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
     for line in in_file:
         text = line_reader(line)
         tokens, language = tokenizer(text)
-        tokenized = '\n'.join(tokens)
         if language is not None:
+            tokenized = '\n'.join(tokens)
             out_filename = '%s.%s.txt' % (out_prefix, language)
             if out_filename in out_files:
                 out_file = out_files[out_filename]
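
For context, a minimal sketch of what the loop looks like after this change. Everything outside the hunk (the body of last_tab, how out_files entries are opened, and the write/close handling) is not shown in the diff and is assumed here for illustration only.

    def last_tab(line):
        # Assumed helper: take the text after the last tab on the line.
        return line.rstrip('\n').split('\t')[-1]

    def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
        out_files = {}
        with open(in_filename, encoding='utf-8') as in_file:
            for line in in_file:
                text = line_reader(line)
                tokens, language = tokenizer(text)
                # After this commit, the join happens only when a language was
                # detected, so lines with language=None are skipped entirely.
                if language is not None:
                    tokenized = '\n'.join(tokens)
                    out_filename = '%s.%s.txt' % (out_prefix, language)
                    if out_filename in out_files:
                        out_file = out_files[out_filename]
                    else:
                        # Assumed: open each per-language output file once
                        # and cache the handle.
                        out_file = open(out_filename, 'w', encoding='utf-8')
                        out_files[out_filename] = out_file
                    out_file.write(tokenized + '\n')
        for out_file in out_files.values():
            out_file.close()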