From c57032d5cbbaa967731e8d1bde43a445063e2fef Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Wed, 14 Mar 2018 15:07:45 -0400
Subject: [PATCH] code review fixes to wordfreq.tokens

---
 wordfreq/tokens.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f2b84e4..8e1bb20 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -211,14 +211,13 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         # This is the default case where we use the regex tokenizer. First
         # let's complain a bit if we ended up here because we don't have an
         # appropriate tokenizer.
-        if info['tokenizer'] != 'regex':
-            if lang not in _WARNED_LANGUAGES:
-                logger.warning(
-                    "The language '{}' is in the '{}' script, which we don't "
-                    "have a tokenizer for. The results will be bad."
-                    .format(lang, info['script'])
-                )
-                _WARNED_LANGUAGES.add(lang)
+        if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
+            logger.warning(
+                "The language '{}' is in the '{}' script, which we don't "
+                "have a tokenizer for. The results will be bad."
+                .format(lang, info['script'])
+            )
+            _WARNED_LANGUAGES.add(lang)
         tokens = simple_tokenize(text, include_punctuation=include_punctuation)
 
     return tokens
@@ -232,9 +231,12 @@ def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=Fals
 
     In particular:
 
-    - If a token has 2 adjacent digits, all its digits will be replaced with
-      the digit '0', so that frequencies for numbers don't have to be counted
-      separately. This is similar to word2vec, which replaces them with '#'.
+    - In any sequence of 2 or more adjacent digits, possibly with intervening
+      punctuation such as a decimal point, each digit is replaced with '0'
+      so that frequencies for numbers don't have to be counted separately.
+
+      This is similar to but not quite identical to the word2vec Google News
+      data, which replaces digits with '#' in tokens with more than one digit.
 
     - In Chinese, unless Traditional Chinese is specifically requested using
       'zh-Hant', all characters will be converted to Simplified Chinese.