From 883aa5baeb914064d1e3b274a7c04be4542d654e Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 12 Jan 2016 12:59:18 -0500 Subject: [PATCH] Separate tokens with spaces, not line breaks, in intermediate files Former-commit-id: 115c74583eda6324104732712e3b238d9f0dcc56 --- wordfreq_builder/wordfreq_builder/tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 0783f80..ae17546 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -95,7 +95,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer): text = line.split('\t')[-1].strip() language, tokens = tokenizer(text) if language != 'un': - tokenized = '\n'.join(tokens) + tokenized = ' '.join(tokens) out_filename = '%s.%s.txt' % (out_prefix, language) if out_filename in out_files: out_file = out_files[out_filename]