refactor the tokenizer, add include_punctuation option

2024-12-23 17:31:41 +00:00 · 2015-09-15 13:26:09 -04:00 · 2015-09-15 13:26:09 -04:00 · e8e6e0a231
commit e8e6e0a231
parent 669bd16c13
2 changed files with 61 additions and 35 deletions
--- a/tests/test.py
+++ b/tests/test.py
@ -21,17 +21,19 @@ def test_languages():
    avail = available_languages()
    assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
    for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@ -99,6 +101,9 @@ def test_tokenization():
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
    
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])
@ -108,6 +113,9 @@ def test_tokenization():

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
    eq_(tokenize('WEISS', 'de'), ['weiss'])
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -39,10 +39,16 @@ TOKEN_RE = regex.compile(r"""
    (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)  # used in place of TOKEN_RE when include_punctuation is true; alternatives: a run of ideographs/hiragana, a run of punctuation, or one non-space character extended while no word boundary intervenes
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.
@ -57,22 +63,44 @@ def simple_tokenize(text):
      ideograms and hiragana) relatively untokenized, instead of splitting each
      character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

    - It breaks on all spaces, even the "non-breaking" ones.
    """
    text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
    in Turkish.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+def japanese_tokenize(text, include_punctuation=False):  # split `text` with mecab_tokenize and return the kept tokens, casefolded, as a list
+    global mecab_tokenize  # the import below must rebind the module-level name, not create a local
+    if mecab_tokenize is None:  # still the module-level None placeholder, i.e. not imported yet
+        from wordfreq.japanese import mecab_tokenize  # deferred import: only runs when Japanese text is actually tokenized
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE  # pick the pattern that decides which tokens are kept
+    return [token.casefold() for token in tokens if token_expr.match(token)]  # match() only tests the start of each token; kept tokens are returned whole
+
+
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):  # split `text` with jieba_tokenize and return the kept tokens, casefolded, as a list
+    global jieba_tokenize  # the import below must rebind the module-level name, not create a local
+    if jieba_tokenize is None:  # still the module-level None placeholder, i.e. not imported yet
+        from wordfreq.chinese import jieba_tokenize  # deferred import: only runs when Chinese text is actually tokenized
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)  # external_wordlist is forwarded unchanged
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE  # pick the pattern that decides which tokens are kept
+    return [token.casefold() for token in tokens if token_expr.match(token)]  # match() only tests the start of each token; kept tokens are returned whole


 def remove_arabic_marks(text):
@ -89,7 +117,7 @@ def remove_arabic_marks(text):

 mecab_tokenize = None
 jieba_tokenize = None
-def tokenize(text, lang, external_wordlist=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language.
@ -124,24 +152,14 @@ def tokenize(text, lang, external_wordlist=False):
    first, so that they can be expected to match the data.
    """
    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)