mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
Replace multi-digit sequences with zeroes
This commit is contained in:
parent
24e26c4c1d
commit
bb5df3b074
@ -97,6 +97,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
|
||||
|
||||
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
|
||||
|
||||
# Matches a single digit character. Raw strings are used so that `\d` reaches
# the regex engine verbatim instead of being an invalid string escape.
DIGIT_RE = regex.compile(r'\d')
# Matches a digit followed by one or more digits, periods, or commas, i.e. a
# number at least two characters long.
MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
|
||||
|
||||
|
||||
def simple_tokenize(text, include_punctuation=False):
|
||||
"""
|
||||
@ -129,9 +132,15 @@ def simple_tokenize(text, include_punctuation=False):
|
||||
"""
|
||||
text = unicodedata.normalize('NFC', text)
|
||||
if include_punctuation:
|
||||
return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
|
||||
return [
|
||||
smash_numbers(token.casefold())
|
||||
for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
|
||||
]
|
||||
else:
|
||||
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
|
||||
return [
|
||||
smash_numbers(token.strip("'").casefold())
|
||||
for token in TOKEN_RE.findall(text)
|
||||
]
|
||||
|
||||
|
||||
def turkish_tokenize(text, include_punctuation=False):
|
||||
@ -142,7 +151,7 @@ def turkish_tokenize(text, include_punctuation=False):
|
||||
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
|
||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||
return [
|
||||
commas_to_cedillas(token.strip("'").casefold())
|
||||
smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
|
||||
for token in token_expr.findall(text)
|
||||
]
|
||||
|
||||
@ -154,7 +163,7 @@ def romanian_tokenize(text, include_punctuation=False):
|
||||
"""
|
||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||
return [
|
||||
cedillas_to_commas(token.strip("'").casefold())
|
||||
smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
|
||||
for token in token_expr.findall(text)
|
||||
]
|
||||
|
||||
@ -170,7 +179,8 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
|
||||
from wordfreq.mecab import mecab_tokenize
|
||||
tokens = mecab_tokenize(text, lang)
|
||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||
return [token.casefold() for token in tokens if token_expr.match(token)]
|
||||
return [smash_numbers(token.casefold()) for token in tokens
|
||||
if token_expr.match(token)]
|
||||
|
||||
|
||||
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
|
||||
@ -182,7 +192,8 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
|
||||
from wordfreq.chinese import jieba_tokenize
|
||||
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
|
||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||
return [token.casefold() for token in tokens if token_expr.match(token)]
|
||||
return [smash_numbers(token.casefold()) for token in tokens
|
||||
if token_expr.match(token)]
|
||||
|
||||
|
||||
def remove_marks(text):
|
||||
@ -230,6 +241,21 @@ def cedillas_to_commas(text):
|
||||
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
|
||||
)
|
||||
|
||||
def sub_zeroes(match):
    """
    Substitution callback: take a regex match object and return the text it
    matched with every character matched by DIGIT_RE rewritten as '0'.
    Characters that are not digits are passed through unchanged.
    """
    matched_text = match.group(0)
    return DIGIT_RE.sub('0', matched_text)
|
||||
|
||||
|
||||
def smash_numbers(text):
    """
    Rewrite every span of `text` matched by MULTI_DIGIT_RE so that its digits
    become zeroes, and return the resulting string.

    This keeps us from having to track separate frequencies for thousands of
    distinct numbers. Spans that MULTI_DIGIT_RE does not match are returned
    as they were.
    """
    # sub_zeroes is invoked once per match and supplies the replacement text.
    smashed = MULTI_DIGIT_RE.sub(sub_zeroes, text)
    return smashed
|
||||
|
||||
|
||||
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user