mirror of
https://github.com/rspeer/wordfreq.git
synced 2025-01-15 05:36:01 +00:00
automatically closes input file in tokenize_file
This commit is contained in:
parent
2fbfbfcc98
commit
7fc0ba9092
@@ -62,18 +62,19 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
|
|||||||
to mark the token boundaries.
|
to mark the token boundaries.
|
||||||
"""
|
"""
|
||||||
out_files = {}
|
out_files = {}
|
||||||
for line in open(in_filename, encoding='utf-8'):
|
with open(in_filename, encoding='utf-8') as in_file:
|
||||||
text = line_reader(line)
|
for line in in_file:
|
||||||
tokens, language = tokenizer(text)
|
text = line_reader(line)
|
||||||
tokenized = '\n'.join(tokens)
|
tokens, language = tokenizer(text)
|
||||||
if language is not None:
|
tokenized = '\n'.join(tokens)
|
||||||
out_filename = '%s.%s.txt' % (out_prefix, language)
|
if language is not None:
|
||||||
if out_filename in out_files:
|
out_filename = '%s.%s.txt' % (out_prefix, language)
|
||||||
out_file = out_files[out_filename]
|
if out_filename in out_files:
|
||||||
else:
|
out_file = out_files[out_filename]
|
||||||
out_file = open(out_filename, 'w', encoding='utf-8')
|
else:
|
||||||
out_files[out_filename] = out_file
|
out_file = open(out_filename, 'w', encoding='utf-8')
|
||||||
print(tokenized, file=out_file)
|
out_files[out_filename] = out_file
|
||||||
|
print(tokenized, file=out_file)
|
||||||
for out_file in out_files.values():
|
for out_file in out_files.values():
|
||||||
out_file.close()
|
out_file.close()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user