From 8ddc19a5ca598fc278a2f0a80c825ccdde5194fa Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 13 Jan 2016 15:18:12 -0500
Subject: [PATCH] fix documentation in wordfreq_builder.tokenizers

---
 wordfreq_builder/wordfreq_builder/tokenizers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index ae17546..b47e94a 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 
 
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
 
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'
 
     tokens = tokenize(text, lang, include_punctuation=True)
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
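
For illustration, the wording fix in the second hunk ("spaces", not "newlines") describes the layout of the per-language output files: each tokenized text becomes one line in the file for its detected language, with that text's tokens joined by spaces. The sketch below shows that layout only; the function name, the (lang, tokens) return shape of the tokenizer, and the output filename pattern are assumptions for the example, since the body of tokenize_by_language is not part of this patch.

    def write_tokens_by_language_sketch(texts, tokenizer, out_prefix):
        # Hypothetical illustration of the documented output layout:
        # one file per detected language, one line per input text,
        # tokens separated by spaces.
        out_files = {}
        try:
            for text in texts:
                lang, tokens = tokenizer(text)   # assumed return shape
                if lang not in out_files:
                    out_files[lang] = open('%s.%s.txt' % (out_prefix, lang),
                                           'w', encoding='utf-8')
                # Spaces between the tokens; a newline separates texts.
                print(' '.join(tokens), file=out_files[lang])
        finally:
            for f in out_files.values():
                f.close()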