Handle smashing numbers only at the end of tokenize().

This does make the code a lot clearer.
2024-12-23 09:21:37 +00:00 · 2017-01-11 19:04:19 -05:00 · 2017-01-11 19:04:19 -05:00 · abd0820a32
commit abd0820a32
parent 93306e55a0
1 changed files with 17 additions and 42 deletions
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -101,7 +101,7 @@ DIGIT_RE = regex.compile('\d')
 MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')


-def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
+def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.
@ -121,11 +121,6 @@ def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
      such as emoji. If `include_punctuation` is True, it outputs all non-space
      tokens.

-    - If `combine_numbers` is True, then multi-digit numbers will be replaced
-      by strings of zeroes. When looking up word frequencies, this allows all
-      numbers of the same length to be treated as the same "word", avoiding
-      unnecessarily sparse data.
-
    - It breaks on all spaces, even the "non-breaking" ones.

    - It aims to keep marks together with words, so that they aren't erroneously
@ -136,23 +131,18 @@ def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
      would end up in its own token, which is worse.
    """
    text = unicodedata.normalize('NFC', text)
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
    if include_punctuation:
        return [
-            postprocess(token.casefold())
+            token.casefold()
            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
        ]
    else:
        return [
-            postprocess(token.strip("'").casefold())
+            token.strip("'").casefold()
            for token in TOKEN_RE.findall(text)
        ]

-def tokenize_mecab_language(text, lang, include_punctuation=False,
-                            combine_numbers=False):
+def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
    """
@ -161,32 +151,21 @@ def tokenize_mecab_language(text, lang, include_punctuation=False,
        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
    if mecab_tokenize is None:
        from wordfreq.mecab import mecab_tokenize
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
    tokens = mecab_tokenize(text, lang)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [postprocess(token.casefold()) for token in tokens
-            if token_expr.match(token)]
+    return [token.casefold() for token in tokens if token_expr.match(token)]


-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False,
-                     combine_numbers=False):
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
    """
    global jieba_tokenize
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [postprocess(token.casefold()) for token in tokens
-            if token_expr.match(token)]
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_marks(text):
@ -274,13 +253,6 @@ def smash_numbers(text):
    return MULTI_DIGIT_RE.sub(sub_zeroes, text)


-def _identity(text):
-    """
-    The identity function, as an alternative to smashing numbers.
-    """
-    return text
-
-
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
             combine_numbers=False):
    """
@ -393,20 +365,23 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
    # language
    lang = lang.split('-')[0]
    if lang == 'ja' or lang == 'ko':
-        return tokenize_mecab_language(text, lang, include_punctuation, combine_numbers)
+        result = tokenize_mecab_language(text, lang, include_punctuation)
    elif lang == 'zh':
-        return chinese_tokenize(text, include_punctuation, external_wordlist, combine_numbers)
+        result = chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
-        return simple_tokenize(preprocess_turkish(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
-        return simple_tokenize(preprocess_romanian(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
        # These are the three language codes that could include Serbian text,
        # which could be in Cyrillic.
-        return simple_tokenize(preprocess_serbian(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
-        return simple_tokenize(text, include_punctuation, combine_numbers)
+        result = simple_tokenize(text, include_punctuation)
    else:
-        return simple_tokenize(text, include_punctuation, combine_numbers)
+        result = simple_tokenize(text, include_punctuation)

+    if combine_numbers:
+        result = [smash_numbers(token) for token in result]
+    return result