mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00

fix documentation in wordfreq_builder.tokenizers

This commit is contained in:
parent 511fcb6f91
commit 8ddc19a5ca
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 
 
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
 
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'
 
     tokens = tokenize(text, lang, include_punctuation=True)
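The comment added in this hunk documents a simple prior-based fallback: Reddit data is overwhelmingly English, so a detected language outside the whitelist is more likely a misdetection than real signal. A minimal, self-contained sketch of that pattern follows; the whitelist contents here are an illustrative assumption, not the module's actual KEEP_THESE_LANGUAGES set.

    # Sketch of the fallback described in the new comment. This whitelist is
    # an illustrative assumption, not wordfreq_builder's actual set.
    KEEP_THESE_LANGUAGES = {'en', 'es', 'fr', 'de', 'pt', 'ja'}

    def apply_english_fallback(detected_lang):
        # On mostly-English data, a rarely-detected language is more likely
        # a detector error than real signal, so fall back to English.
        if detected_lang not in KEEP_THESE_LANGUAGES:
            return 'en'
        return detected_lang

    print(apply_english_fallback('xx'))  # -> 'en'
    print(apply_english_fallback('fr'))  # -> 'fr'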
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
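The docstring fix in this hunk corrects "newlines" to "spaces": tokens within a line are joined by spaces, while the split by language happens across separate output files. A hedged sketch of that behavior, assuming the tokenizer returns a (lang, tokens) pair and assuming a hypothetical <prefix>.<lang>.txt naming scheme; the builder's real file layout may differ.

    # Sketch of per-language output with space-separated tokens, per the
    # corrected docstring. The file-naming scheme and the (lang, tokens)
    # return shape of `tokenizer` are assumptions for illustration.
    def tokenize_by_language_sketch(in_filename, out_prefix, tokenizer):
        out_files = {}
        with open(in_filename, encoding='utf-8') as infile:
            for line in infile:
                lang, tokens = tokenizer(line.strip())
                if lang not in out_files:
                    out_files[lang] = open('%s.%s.txt' % (out_prefix, lang),
                                           'w', encoding='utf-8')
                print(' '.join(tokens), file=out_files[lang])
        for f in out_files.values():
            f.close()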