automatically closes input file in tokenize_file

Joshua Chin 2015-06-17 11:42:34 -04:00
parent 2fbfbfcc98
commit 7fc0ba9092


@@ -62,18 +62,19 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
     to mark the token boundaries.
     """
     out_files = {}
-    for line in open(in_filename, encoding='utf-8'):
-        text = line_reader(line)
-        tokens, language = tokenizer(text)
-        tokenized = '\n'.join(tokens)
-        if language is not None:
-            out_filename = '%s.%s.txt' % (out_prefix, language)
-            if out_filename in out_files:
-                out_file = out_files[out_filename]
-            else:
-                out_file = open(out_filename, 'w', encoding='utf-8')
-                out_files[out_filename] = out_file
-            print(tokenized, file=out_file)
+    with open(in_filename, encoding='utf-8') as in_file:
+        for line in in_file:
+            text = line_reader(line)
+            tokens, language = tokenizer(text)
+            tokenized = '\n'.join(tokens)
+            if language is not None:
+                out_filename = '%s.%s.txt' % (out_prefix, language)
+                if out_filename in out_files:
+                    out_file = out_files[out_filename]
+                else:
+                    out_file = open(out_filename, 'w', encoding='utf-8')
+                    out_files[out_filename] = out_file
+                print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
 
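
The with statement closes in_file even when an exception escapes the loop; the per-language output files, though, are still closed only by the trailing for loop, which an exception would skip. A minimal sketch of one way to extend the same guarantee to the outputs, using contextlib.ExitStack; the function name and the last_tab stand-in here are illustrative, not part of this commit:

    from contextlib import ExitStack

    def last_tab(line):
        # Stand-in for the repo's last_tab reader (assumption): take the
        # text after the final tab, stripping the trailing newline.
        return line.rstrip('\n').rsplit('\t', 1)[-1]

    def tokenize_file_closing_outputs(in_filename, out_prefix, tokenizer,
                                      line_reader=last_tab):
        # Same logic as the committed version, but ExitStack also closes
        # the output files if tokenizer() or line_reader() raises.
        out_files = {}
        with ExitStack() as stack:
            in_file = stack.enter_context(open(in_filename, encoding='utf-8'))
            for line in in_file:
                text = line_reader(line)
                tokens, language = tokenizer(text)
                if language is not None:
                    out_filename = '%s.%s.txt' % (out_prefix, language)
                    if out_filename not in out_files:
                        # enter_context registers the file to be closed when
                        # the with block exits, normally or via an exception
                        out_files[out_filename] = stack.enter_context(
                            open(out_filename, 'w', encoding='utf-8'))
                    print('\n'.join(tokens), file=out_files[out_filename])

ExitStack fits this pattern because the number of output files is not known up front, so they cannot all be named in a single with statement.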