Replace multi-digit sequences with zeroes

Rob Speer 2016-12-09 15:55:08 -05:00
parent 24e26c4c1d
commit bb5df3b074


@@ -97,6 +97,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+DIGIT_RE = regex.compile('\d')
+MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+
 def simple_tokenize(text, include_punctuation=False):
     """
@@ -129,9 +132,15 @@ def simple_tokenize(text, include_punctuation=False):
     """
     text = unicodedata.normalize('NFC', text)
     if include_punctuation:
-        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
+        return [
+            smash_numbers(token.casefold())
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
     else:
-        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+        return [
+            smash_numbers(token.strip("'").casefold())
+            for token in TOKEN_RE.findall(text)
+        ]
 def turkish_tokenize(text, include_punctuation=False):
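
To illustrate the effect on simple_tokenize, here is a hypothetical session (assuming this file is importable as wordfreq.tokens; the exact handling of punctuation inside numbers depends on TOKEN_RE, so the output is illustrative):

    >>> from wordfreq.tokens import simple_tokenize
    >>> simple_tokenize('the year 2016')
    ['the', 'year', '0000']
    >>> simple_tokenize('chapter 7')   # single digits are left alone
    ['chapter', '7']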
@@ -142,7 +151,7 @@ def turkish_tokenize(text, include_punctuation=False):
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        commas_to_cedillas(token.strip("'").casefold())
+        smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
         for token in token_expr.findall(text)
     ]
@@ -154,7 +163,7 @@ def romanian_tokenize(text, include_punctuation=False):
     """
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        cedillas_to_commas(token.strip("'").casefold())
+        smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
         for token in token_expr.findall(text)
     ]
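
In both of these tokenizers, smash_numbers wraps the existing per-token cleanup, so the zeroing runs after casefolding and the cedilla/comma mapping; since none of those steps touch digits, the order only affects readability. A quick illustrative walk-through with a made-up token (not real tokenizer output):

    token = "1.000'de"    # hypothetical Turkish token; "'de" is a locative suffix
    cleaned = commas_to_cedillas(token.strip("'").casefold())   # digits untouched
    print(smash_numbers(cleaned))                               # -> "0.000'de"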
@@ -170,7 +179,8 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
     from wordfreq.mecab import mecab_tokenize
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]
 def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
@@ -182,7 +192,8 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     from wordfreq.chinese import jieba_tokenize
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]
 def remove_marks(text):
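
The MeCab and jieba paths differ from the regex-based tokenizers in that the tokens come from an external tokenizer and are only filtered by token_expr.match, but the number smashing is applied the same way. An illustrative run (the token list is made up, not real MeCab or jieba output, and it assumes TOKEN_RE matches all four tokens):

    tokens = ['2016', '年', '12', '月']    # made-up tokens for illustration
    print([smash_numbers(t.casefold()) for t in tokens if TOKEN_RE.match(t)])
    # -> ['0000', '年', '00', '月']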
@@ -230,6 +241,21 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )
+
+def sub_zeroes(match):
+    """
+    Given a regex match, return what it matched with digits replaced by
+    zeroes.
+    """
+    return DIGIT_RE.sub('0', match.group(0))
+
+def smash_numbers(text):
+    """
+    Replace sequences of multiple digits with zeroes, so we don't need to
+    distinguish the frequencies of thousands of numbers.
+    """
+    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """