From a893823d6ea444d0d2a33759fcdb2e331559bdb8 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 26 Aug 2015 13:03:23 -0400
Subject: [PATCH] un-flake wordfreq_builder.tokenizers, and edit docstrings

---
 wordfreq_builder/wordfreq_builder/tokenizers.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 997c0c8..92d0714 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -1,4 +1,3 @@
-from html.entities import name2codepoint
 from wordfreq import tokenize
 from ftfy.fixes import unescape_html
 import regex
@@ -26,7 +25,7 @@ TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
 
 def cld2_surface_tokenizer(text):
     """
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
     """
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
@@ -38,7 +37,7 @@ def cld2_surface_tokenizer(text):
 
 def cld2_detect_language(text):
     """
-    Uses CLD2 to detect the language
+    Uses CLD2 to detect the language.
     """
     # Format of pycld2.detect:
     #   (Confident in result: bool,
@@ -55,9 +54,12 @@ def cld2_detect_language(text):
 
 def tokenize_twitter(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the given tokenizer, sorting the
-    results by the language of each line, and inserting newlines
-    to mark the token boundaries.
+    Process a file by running it through the Twitter-specific tokenizer,
+    which uses cld2 for language detection, and removes Twitter handles
+    and t.co URLs.
+
+    Produces output files that are separated by language, with newlines
+    between the tokens.
     """
     out_files = {}
     with open(in_filename, encoding='utf-8') as in_file:
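
For reviewers, a minimal usage sketch of the two entry points this patch
touches, assuming wordfreq_builder and its dependencies (wordfreq, ftfy,
pycld2, regex) are installed. The (language, tokens) return shape of
cld2_surface_tokenizer is inferred from the function body rather than shown
in this diff, and the file paths are hypothetical:

    from wordfreq_builder.tokenizers import (
        cld2_surface_tokenizer,
        tokenize_twitter,
    )

    # Tokenize one tweet-like string. Per the docstring above, HTML entities
    # are unescaped and the @handle and t.co URL are stripped, so CLD2's
    # language guess is based on the remaining words.
    lang, tokens = cld2_surface_tokenizer(
        '@friend regardez &ccedil;a http://t.co/abc123 Bonjour tout le monde'
    )
    print(lang, tokens)

    # Process a whole file of tweets, producing output files separated by
    # language (presumably named from out_prefix), with one token per line.
    # 'tweets.txt' and the 'tweets' prefix are made up for illustration.
    tokenize_twitter('tweets.txt', 'tweets', cld2_surface_tokenizer)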