fix documentation in wordfreq_builder.tokenizers

Former-commit-id: 8ddc19a5ca
2024-12-23 09:21:37 +00:00 · 2016-01-13 15:18:12 -05:00 · 2016-01-13 15:18:12 -05:00 · 6eca3cff5a
commit 6eca3cff5a
parent 95cdf41fe8
1 changed file with 7 additions and 1 deletion
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {


 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
    text = URL_RE.sub('', text)
    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)
    if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
        lang = 'en'

    tokens = tokenize(text, lang, include_punctuation=True)
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
    """
    Process a file by running it through a given tokenizer.

-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
    between the tokens.
    """
    out_files = {}