updated monolingual_tokenize_file docstring, and removed unused argument

This commit is contained in:
Joshua Chin 2015-06-18 10:20:54 -04:00
parent 34e9512517
commit 18b53f6071

View File

@ -93,11 +93,16 @@ def fix_entities(text):
def monolingual_tokenize_file(in_filename, out_filename, language, def monolingual_tokenize_file(in_filename, out_filename, language,
tokenizer, line_reader=last_tab, tokenizer, line_reader=last_tab,
token_filter=lowercase_text_filter, sample_proportion=1):
sample_proportion=100):
""" """
Apply a tokenizer that can distinguish different languages, but only Process a file by running it through the given tokenizer, only keeping
keep the lines that are in the language we're asking for. lines of the language we're asking for, and inserting newlines
to mark the token boundaries.
`line_reader` is applied to each line before it is given to the tokenizer
Only the first line out of every `sample_proportion` lines is run through
the tokenizer.
""" """
with open(in_filename, encoding='utf-8', errors='replace') as in_file: with open(in_filename, encoding='utf-8', errors='replace') as in_file:
with open(out_filename, 'w', encoding='utf-8') as out_file: with open(out_filename, 'w', encoding='utf-8') as out_file: