mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Replace multi-digit sequences with zeroes
This commit is contained in:
parent
a8e2fa5acf
commit
d6d528de74
@ -97,6 +97,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
|
|||||||
|
|
||||||
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
|
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
|
||||||
|
|
||||||
|
# Matches a single decimal digit. Raw string avoids the invalid
# escape sequence '\d' in a plain string literal (a DeprecationWarning,
# and a SyntaxWarning from Python 3.12 onward).
DIGIT_RE = regex.compile(r'\d')
|
||||||
|
# Matches a digit followed by one or more digits, periods, or commas —
# i.e. a multi-digit numeric sequence such as "1,000" or "3.14".
# Raw string avoids the invalid escape sequences '\d' in a plain
# string literal (a SyntaxWarning from Python 3.12 onward).
MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
|
||||||
|
|
||||||
|
|
||||||
def simple_tokenize(text, include_punctuation=False):
|
def simple_tokenize(text, include_punctuation=False):
|
||||||
"""
|
"""
|
||||||
@ -129,9 +132,15 @@ def simple_tokenize(text, include_punctuation=False):
|
|||||||
"""
|
"""
|
||||||
text = unicodedata.normalize('NFC', text)
|
text = unicodedata.normalize('NFC', text)
|
||||||
if include_punctuation:
|
if include_punctuation:
|
||||||
return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
|
return [
|
||||||
|
smash_numbers(token.casefold())
|
||||||
|
for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
|
return [
|
||||||
|
smash_numbers(token.strip("'").casefold())
|
||||||
|
for token in TOKEN_RE.findall(text)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def turkish_tokenize(text, include_punctuation=False):
|
def turkish_tokenize(text, include_punctuation=False):
|
||||||
@ -142,7 +151,7 @@ def turkish_tokenize(text, include_punctuation=False):
|
|||||||
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
|
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
|
||||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||||
return [
|
return [
|
||||||
commas_to_cedillas(token.strip("'").casefold())
|
smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
|
||||||
for token in token_expr.findall(text)
|
for token in token_expr.findall(text)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -154,7 +163,7 @@ def romanian_tokenize(text, include_punctuation=False):
|
|||||||
"""
|
"""
|
||||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||||
return [
|
return [
|
||||||
cedillas_to_commas(token.strip("'").casefold())
|
smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
|
||||||
for token in token_expr.findall(text)
|
for token in token_expr.findall(text)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -170,7 +179,8 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
|
|||||||
from wordfreq.mecab import mecab_tokenize
|
from wordfreq.mecab import mecab_tokenize
|
||||||
tokens = mecab_tokenize(text, lang)
|
tokens = mecab_tokenize(text, lang)
|
||||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||||
return [token.casefold() for token in tokens if token_expr.match(token)]
|
return [smash_numbers(token.casefold()) for token in tokens
|
||||||
|
if token_expr.match(token)]
|
||||||
|
|
||||||
|
|
||||||
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
|
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
|
||||||
@ -182,7 +192,8 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
|
|||||||
from wordfreq.chinese import jieba_tokenize
|
from wordfreq.chinese import jieba_tokenize
|
||||||
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
|
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
|
||||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||||
return [token.casefold() for token in tokens if token_expr.match(token)]
|
return [smash_numbers(token.casefold()) for token in tokens
|
||||||
|
if token_expr.match(token)]
|
||||||
|
|
||||||
|
|
||||||
def remove_marks(text):
|
def remove_marks(text):
|
||||||
@ -230,6 +241,21 @@ def cedillas_to_commas(text):
|
|||||||
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
|
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def sub_zeroes(match):
    """
    Replacement callback for regex substitution: take the matched text
    and return it with every digit turned into '0'.
    """
    matched_text = match.group(0)
    return DIGIT_RE.sub('0', matched_text)
|
||||||
|
|
||||||
|
|
||||||
|
def smash_numbers(text):
    """
    Collapse every multi-digit sequence in `text` into an equal-length
    run of zeroes (via `sub_zeroes`), so that thousands of distinct
    numbers don't each need their own frequency entry.
    """
    smashed = MULTI_DIGIT_RE.sub(sub_zeroes, text)
    return smashed
|
||||||
|
|
||||||
|
|
||||||
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user