Separate tokens with spaces, not line breaks, in intermediate files

This commit is contained in:
Rob Speer 2016-01-12 12:59:18 -05:00
parent 973caca253
commit 115c74583e

View File

@@ -95,7 +95,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
text = line.split('\t')[-1].strip() text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text) language, tokens = tokenizer(text)
if language != 'un': if language != 'un':
tokenized = '\n'.join(tokens) tokenized = ' '.join(tokens)
out_filename = '%s.%s.txt' % (out_prefix, language) out_filename = '%s.%s.txt' % (out_prefix, language)
if out_filename in out_files: if out_filename in out_files:
out_file = out_files[out_filename] out_file = out_files[out_filename]