From e8e6e0a23196abf0ecc0cf3bc72ba9943226d119 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 15 Sep 2015 13:26:09 -0400
Subject: [PATCH] refactor the tokenizer, add `include_punctuation` option

---
 tests/test.py      | 26 +++++++++++------
 wordfreq/tokens.py | 70 +++++++++++++++++++++++++++++-----------------
 2 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index dd26750..0013dcb 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)
 
-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)
 
-            # Make up a weirdly verbose language code and make sure
-            # we still get it
-            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert_greater(word_frequency(text, new_lang_code), 0)
 
 
 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
+
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),
@@ -108,6 +113,9 @@ def test_tokenization():
     eq_(tokenize("flip-flop", 'en'),
         ['flip', 'flop'])
 
+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index a2c308c..ad64bcd 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -39,10 +39,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)
 
+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
 
-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -57,22 +63,44 @@ def simple_tokenize(text):
       ideograms and hiragana) relatively untokenized, instead of splitting
       each character into its own token.
 
-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.
 
     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
 
 
-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
 def remove_arabic_marks(text):
@@ -89,7 +117,7 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
 jieba_tokenize = None
 
-def tokenize(text, lang, external_wordlist=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.
@@ -124,24 +152,14 @@ def tokenize(text, lang, external_wordlist=False):
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)