fix documentation in wordfreq_builder.tokenizers

commit ee8cfb5a50
parent 56f830d678
Author: Rob Speer
Date: 2016-01-13 15:18:12 -05:00
Former-commit-id: 8ddc19a5ca


@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'
     tokens = tokenize(text, lang, include_punctuation=True)
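
For context, a minimal usage sketch of the tokenizer documented above. It assumes the `wordfreq_builder.tokenizers` module from this commit (and its CLD2 dependency) is importable; the sample comment text is made up, and the exact return type is not shown in this hunk, so it is simply printed.

    # Rough sketch, assuming wordfreq_builder from this commit is installed.
    from wordfreq_builder.tokenizers import cld2_reddit_tokenizer

    comment = "Check out https://example.com for more details!"
    # URLs are stripped, the language is detected with CLD2, rare detections
    # fall back to English, and the text is tokenized with punctuation kept.
    print(cld2_reddit_tokenizer(comment))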
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
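
To make the docstring fix concrete, here is a rough sketch of the output layout it describes: one file per detected language, one line per input text, with tokens joined by spaces on each line. The `write_tokens` helper and the file-naming scheme are hypothetical illustrations, not the module's actual code.

    # Hypothetical helper illustrating the per-language, space-separated output.
    def write_tokens(out_prefix, lang, tokens, out_files):
        # Open (and cache) one output file per detected language.
        if lang not in out_files:
            out_files[lang] = open('%s.%s.txt' % (out_prefix, lang), 'w',
                                   encoding='utf-8')
        # Tokens are separated by spaces; newlines separate the texts.
        print(' '.join(tokens), file=out_files[lang])

    out_files = {}
    write_tokens('tokens/reddit', 'en', ['this', 'is', 'great', '!'], out_files)
    for f in out_files.values():
        f.close()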