Replace multi-digit sequences with zeroes

Robyn Speer 2016-12-09 15:55:08 -05:00
parent a8e2fa5acf
commit d6d528de74


@@ -97,6 +97,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

+DIGIT_RE = regex.compile('\d')
+MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+

 def simple_tokenize(text, include_punctuation=False):
     """
@@ -129,9 +132,15 @@ def simple_tokenize(text, include_punctuation=False):
     """
     text = unicodedata.normalize('NFC', text)
     if include_punctuation:
-        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
+        return [
+            smash_numbers(token.casefold())
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
     else:
-        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+        return [
+            smash_numbers(token.strip("'").casefold())
+            for token in TOKEN_RE.findall(text)
+        ]


 def turkish_tokenize(text, include_punctuation=False):
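For intuition, the new pipeline looks roughly like this outside wordfreq (a minimal sketch: WORD_RE is a simplified stand-in for the real TOKEN_RE, and smash_numbers is inlined from the helper added at the bottom of this diff):

import regex

DIGIT_RE = regex.compile('\d')
MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
WORD_RE = regex.compile(r"[\w',.]+")  # stand-in; the real TOKEN_RE is far more careful

def smash_numbers(text):
    # Replace every digit inside a multi-digit match with '0'
    return MULTI_DIGIT_RE.sub(lambda m: DIGIT_RE.sub('0', m.group(0)), text)

print([smash_numbers(t.casefold()) for t in WORD_RE.findall('It cost $3,350.75 in 1984')])
# ['it', 'cost', '0,000.00', 'in', '0000']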
@@ -142,7 +151,7 @@ def turkish_tokenize(text, include_punctuation=False):
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        commas_to_cedillas(token.strip("'").casefold())
+        smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
         for token in token_expr.findall(text)
     ]
@@ -154,7 +163,7 @@ def romanian_tokenize(text, include_punctuation=False):
     """
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        cedillas_to_commas(token.strip("'").casefold())
+        smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
         for token in token_expr.findall(text)
     ]
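In the Turkish and Romanian tokenizers the change is the same one-line wrap: smash_numbers is applied last, outside casefolding and the comma/cedilla normalization. Since it only rewrites digits, wrapping it on the outside is safe; for example, reusing the smash_numbers sketch above:

print(smash_numbers('saat 19.30'.casefold()))  # 'saat 00.00'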
@@ -170,7 +179,8 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
     from wordfreq.mecab import mecab_tokenize
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]


 def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
@@ -182,7 +192,8 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     from wordfreq.chinese import jieba_tokenize
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]


 def remove_marks(text):
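The MeCab (Japanese/Korean) and jieba (Chinese) paths get the same treatment: the external tokenizer produces the token list, token_expr.match drops non-word tokens, and each surviving token is casefolded and number-smashed. Reusing the sketch above:

print([smash_numbers(t.casefold()) for t in ['平成', '28', '年', '9', '月']])
# ['平成', '00', '年', '9', '月']  — multi-digit '28' is smashed, lone '9' is kept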
@@ -230,6 +241,21 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )

+
+def sub_zeroes(match):
+    """
+    Given a regex match, return what it matched with digits replaced by
+    zeroes.
+    """
+    return DIGIT_RE.sub('0', match.group(0))
+
+
+def smash_numbers(text):
+    """
+    Replace sequences of multiple digits with zeroes, so we don't need to
+    distinguish the frequencies of thousands of numbers.
+    """
+    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+

 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """