mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
updated monolingual_tokenize_file docstring, and removed unused argument
This commit is contained in:
parent
34e9512517
commit
18b53f6071
@ -93,11 +93,16 @@ def fix_entities(text):
|
|||||||
|
|
||||||
def monolingual_tokenize_file(in_filename, out_filename, language,
|
def monolingual_tokenize_file(in_filename, out_filename, language,
|
||||||
tokenizer, line_reader=last_tab,
|
tokenizer, line_reader=last_tab,
|
||||||
token_filter=lowercase_text_filter,
|
sample_proportion=1):
|
||||||
sample_proportion=100):
|
|
||||||
"""
|
"""
|
||||||
Apply a tokenizer that can distinguish different languages, but only
|
Process a file by running it through the given tokenizer, only keeping
|
||||||
keep the lines that are in the language we're asking for.
|
lines of the language we're asking for, and inserting newlines
|
||||||
|
to mark the token boundaries.
|
||||||
|
|
||||||
|
`line_reader` is applied to each line before it is given to the tokenizer.
|
||||||
|
|
||||||
|
Only the first line out of every `sample_proportion` lines is run through
|
||||||
|
the tokenizer.
|
||||||
"""
|
"""
|
||||||
with open(in_filename, encoding='utf-8', errors='replace') as in_file:
|
with open(in_filename, encoding='utf-8', errors='replace') as in_file:
|
||||||
with open(out_filename, 'w', encoding='utf-8') as out_file:
|
with open(out_filename, 'w', encoding='utf-8') as out_file:
|
||||||
|
Loading…
Reference in New Issue
Block a user