Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Don't smash numbers in *all* tokenization, just when looking up freqs

I forgot momentarily that the output of the tokenizer is used by other code.

commit 573ecc53d0 (parent 3cb3c38f47)
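In short, a sketch of the new behavior (the expected outputs are taken from the tests in this diff, and assume the wordfreq version at this commit is installed):

    # Sketch of the intended behavior, based on the tests and docstrings below.
    from wordfreq import tokenize, word_frequency

    # Plain tokenization no longer smashes digit sequences:
    print(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'))
    # -> ['715', 'crσσks', 'by', 'bon', 'iver']

    # Numbers are only combined into strings of zeroes on request:
    print(tokenize('3.14', 'en', combine_numbers=True))   # -> ['0.00']
    print(tokenize('24601', 'en', combine_numbers=True))  # -> ['00000']

    # Frequency lookups still combine numbers internally (see _word_frequency
    # below), so same-length numbers share one frequency estimate:
    print(word_frequency('24601', 'en'))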
tests/test.py
@@ -146,9 +146,15 @@ def test_casefolding():
 def test_number_smashing():
-    eq_(tokenize('1', 'en'), ['1'])
-    eq_(tokenize('3.14', 'en'), ['0.00'])
-    eq_(tokenize('24601', 'en'), ['00000'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
+        ['715', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
+        ['000', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
+        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
+    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
+    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
+    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
 
 
 def test_phrase_freq():
wordfreq/__init__.py
@@ -216,7 +216,7 @@ def iter_wordlist(lang, wordlist='combined'):
 _wf_cache = {}
 
 
 def _word_frequency(word, lang, wordlist, minimum):
-    tokens = tokenize(word, lang)
+    tokens = tokenize(word, lang, combine_numbers=True)
     if not tokens:
         return minimum
wordfreq/tokens.py
@@ -101,7 +101,7 @@ DIGIT_RE = regex.compile('\d')
 MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
 
 
-def simple_tokenize(text, include_punctuation=False):
+def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -121,6 +121,11 @@ def simple_tokenize(text, include_punctuation=False):
     such as emoji. If `include_punctuation` is True, it outputs all non-space
     tokens.
 
+    - If `combine_numbers` is True, then multi-digit numbers will be replaced
+      by strings of zeroes. When looking up word frequencies, this allows all
+      numbers of the same length to be treated as the same "word", avoiding
+      unnecessarily sparse data.
+
     - It breaks on all spaces, even the "non-breaking" ones.
 
     - It aims to keep marks together with words, so that they aren't erroneously
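As a rough sketch of the substitution described in the new docstring bullet: wordfreq's actual implementation is smash_numbers / sub_zeroes using the third-party regex module (shown later in this diff); the stdlib-re version below is an assumption made only to keep the example self-contained.

    import re

    # Mirrors the MULTI_DIGIT_RE idea: a digit followed by more digits, dots or
    # commas. Single digits deliberately don't match and pass through unchanged.
    MULTI_DIGIT_RE = re.compile(r'\d[\d.,]+')

    def combine_numbers_sketch(text):
        # Replace each digit inside a multi-digit run with '0'.
        return MULTI_DIGIT_RE.sub(lambda m: re.sub(r'\d', '0', m.group(0)), text)

    print(combine_numbers_sketch('3.14'))   # -> '0.00'
    print(combine_numbers_sketch('24601'))  # -> '00000'
    print(combine_numbers_sketch('1'))      # -> '1' (single digits are kept)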
@@ -131,18 +136,23 @@ def simple_tokenize(text, include_punctuation=False):
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     if include_punctuation:
         return [
-            smash_numbers(token.casefold())
+            postprocess(token.casefold())
             for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
         ]
     else:
         return [
-            smash_numbers(token.strip("'").casefold())
+            postprocess(token.strip("'").casefold())
             for token in TOKEN_RE.findall(text)
         ]
 
 
-def tokenize_mecab_language(text, lang, include_punctuation=False):
+def tokenize_mecab_language(text, lang, include_punctuation=False,
+                            combine_numbers=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
     """
@@ -151,22 +161,31 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
         raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
     if mecab_tokenize is None:
         from wordfreq.mecab import mecab_tokenize
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [smash_numbers(token.casefold()) for token in tokens
+    return [postprocess(token.casefold()) for token in tokens
             if token_expr.match(token)]
 
 
-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False,
+                     combine_numbers=False):
     """
     Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
     """
     global jieba_tokenize
     if jieba_tokenize is None:
         from wordfreq.chinese import jieba_tokenize
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [smash_numbers(token.casefold()) for token in tokens
+    return [postprocess(token.casefold()) for token in tokens
             if token_expr.match(token)]
@@ -255,7 +274,15 @@ def smash_numbers(text):
     return MULTI_DIGIT_RE.sub(sub_zeroes, text)
 
 
-def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+def _identity(text):
+    """
+    The identity function, as an alternative to smashing numbers.
+    """
+    return text
+
+
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
+             combine_numbers=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language. Strings that are looked up in wordfreq will be run through
@@ -270,6 +297,17 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     - CJK scripts: Chinese, Japanese, Korean
     - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
 
+    The options `include_punctuation`, `external_wordlist`, and
+    `combine_numbers` are passed on to the appropriate tokenizer:
+
+    - `include_punctuation` preserves punctuation as tokens, instead of
+      removing it.
+
+    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
+      instead of wordfreq's wordlist.
+
+    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.
+
     Alphabetic scripts
     ------------------
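A brief usage sketch of the three options documented above; the English example's expected output comes from the tests earlier in this diff, while the Chinese call is left commented out as an illustration only, since it needs the optional Jieba dependency.

    from wordfreq import tokenize

    # include_punctuation keeps punctuation tokens; combine_numbers turns
    # multi-digit numbers into strings of zeroes.
    print(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en',
                   combine_numbers=True, include_punctuation=True))
    # -> ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']

    # external_wordlist only affects Chinese: it tokenizes with Jieba's own
    # wordlist instead of wordfreq's (requires the optional Jieba dependency).
    # tokenize('你好世界', 'zh', external_wordlist=True)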
@@ -355,20 +393,20 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     # language
     lang = lang.split('-')[0]
     if lang == 'ja' or lang == 'ko':
-        return tokenize_mecab_language(text, lang, include_punctuation)
+        return tokenize_mecab_language(text, lang, include_punctuation, combine_numbers)
     elif lang == 'zh':
-        return chinese_tokenize(text, include_punctuation, external_wordlist)
+        return chinese_tokenize(text, include_punctuation, external_wordlist, combine_numbers)
     elif lang == 'tr':
-        return simple_tokenize(preprocess_turkish(text), include_punctuation)
+        return simple_tokenize(preprocess_turkish(text), include_punctuation, combine_numbers)
     elif lang == 'ro':
-        return simple_tokenize(preprocess_romanian(text), include_punctuation)
+        return simple_tokenize(preprocess_romanian(text), include_punctuation, combine_numbers)
     elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
         # These are the three language codes that could include Serbian text,
         # which could be in Cyrillic.
-        return simple_tokenize(preprocess_serbian(text), include_punctuation)
+        return simple_tokenize(preprocess_serbian(text), include_punctuation, combine_numbers)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
-        return simple_tokenize(text, include_punctuation)
+        return simple_tokenize(text, include_punctuation, combine_numbers)
     else:
-        return simple_tokenize(text, include_punctuation)
+        return simple_tokenize(text, include_punctuation, combine_numbers)