mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Separate tokens with spaces, not line breaks, in intermediate files
This commit is contained in:
parent
973caca253
commit
115c74583e
@ -95,7 +95,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
|||||||
text = line.split('\t')[-1].strip()
|
text = line.split('\t')[-1].strip()
|
||||||
language, tokens = tokenizer(text)
|
language, tokens = tokenizer(text)
|
||||||
if language != 'un':
|
if language != 'un':
|
||||||
tokenized = '\n'.join(tokens)
|
tokenized = ' '.join(tokens)
|
||||||
out_filename = '%s.%s.txt' % (out_prefix, language)
|
out_filename = '%s.%s.txt' % (out_prefix, language)
|
||||||
if out_filename in out_files:
|
if out_filename in out_files:
|
||||||
out_file = out_files[out_filename]
|
out_file = out_files[out_filename]
|
||||||
|
Loading…
Reference in New Issue
Block a user