Don't smash numbers in *all* tokenization, just when looking up freqs
I forgot momentarily that the output of the tokenizer is used by other code.
This commit is contained in:
parent 3cb3c38f47
commit 573ecc53d0
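A minimal sketch of the intended behavior after this change, assuming wordfreq is installed at this commit: the default tokenizer output keeps digits intact, and number smashing only happens when `combine_numbers=True` is passed, which is what the frequency lookup now does internally.

    from wordfreq import tokenize, word_frequency

    # Default tokenization now leaves numbers alone, so other code that
    # consumes the tokenizer's output sees the original digits.
    print(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'))
    # ['715', 'crσσks', 'by', 'bon', 'iver']

    # Opting in with combine_numbers=True smashes multi-digit numbers to zeroes.
    print(tokenize('24601', 'en', combine_numbers=True))
    # ['00000']

    # _word_frequency tokenizes with combine_numbers=True, so numbers of the
    # same length should share a single frequency entry ('00000').
    assert word_frequency('24601', 'en') == word_frequency('12345', 'en')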
@@ -146,9 +146,15 @@ def test_casefolding():


 def test_number_smashing():
-    eq_(tokenize('1', 'en'), ['1'])
-    eq_(tokenize('3.14', 'en'), ['0.00'])
-    eq_(tokenize('24601', 'en'), ['00000'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
+        ['715', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
+        ['000', 'crσσks', 'by', 'bon', 'iver'])
+    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
+        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
+    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
+    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
+    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])


 def test_phrase_freq():
@@ -216,7 +216,7 @@ def iter_wordlist(lang, wordlist='combined'):
 _wf_cache = {}

 def _word_frequency(word, lang, wordlist, minimum):
-    tokens = tokenize(word, lang)
+    tokens = tokenize(word, lang, combine_numbers=True)
     if not tokens:
         return minimum

@@ -101,7 +101,7 @@ DIGIT_RE = regex.compile('\d')
 MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')


-def simple_tokenize(text, include_punctuation=False):
+def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -121,6 +121,11 @@ def simple_tokenize(text, include_punctuation=False):
     such as emoji. If `include_punctuation` is True, it outputs all non-space
     tokens.

+    - If `combine_numbers` is True, then multi-digit numbers will be replaced
+      by strings of zeroes. When looking up word frequencies, this allows all
+      numbers of the same length to be treated as the same "word", avoiding
+      unnecessarily sparse data.
+
     - It breaks on all spaces, even the "non-breaking" ones.

     - It aims to keep marks together with words, so that they aren't erroneously
@@ -131,18 +136,23 @@ def simple_tokenize(text, include_punctuation=False):
     would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     if include_punctuation:
         return [
-            smash_numbers(token.casefold())
+            postprocess(token.casefold())
             for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
         ]
     else:
         return [
-            smash_numbers(token.strip("'").casefold())
+            postprocess(token.strip("'").casefold())
             for token in TOKEN_RE.findall(text)
         ]

-def tokenize_mecab_language(text, lang, include_punctuation=False):
+def tokenize_mecab_language(text, lang, include_punctuation=False,
+                            combine_numbers=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
     """
@@ -151,22 +161,31 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
         raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
     if mecab_tokenize is None:
         from wordfreq.mecab import mecab_tokenize
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [smash_numbers(token.casefold()) for token in tokens
+    return [postprocess(token.casefold()) for token in tokens
             if token_expr.match(token)]


-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False,
+                     combine_numbers=False):
     """
     Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
     """
     global jieba_tokenize
     if jieba_tokenize is None:
         from wordfreq.chinese import jieba_tokenize
+    if combine_numbers:
+        postprocess = smash_numbers
+    else:
+        postprocess = _identity
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [smash_numbers(token.casefold()) for token in tokens
+    return [postprocess(token.casefold()) for token in tokens
             if token_expr.match(token)]


@@ -255,7 +274,15 @@ def smash_numbers(text):
     return MULTI_DIGIT_RE.sub(sub_zeroes, text)


-def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+def _identity(text):
+    """
+    The identity function, as an alternative to smashing numbers.
+    """
+    return text
+
+
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
+             combine_numbers=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language. Strings that are looked up in wordfreq will be run through
@@ -270,6 +297,17 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     - CJK scripts: Chinese, Japanese, Korean
     - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

+    The options `include_punctuation`, `external_wordlist`, and
+    `combine_numbers` are passed on to the appropriate tokenizer:
+
+    - `include_punctuation` preserves punctuation as tokens, instead of
+      removing it.
+
+    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
+      instead of wordfreq's wordlist.
+
+    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.
+
     Alphabetic scripts
     ------------------
@@ -355,20 +393,20 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     # language
     lang = lang.split('-')[0]
     if lang == 'ja' or lang == 'ko':
-        return tokenize_mecab_language(text, lang, include_punctuation)
+        return tokenize_mecab_language(text, lang, include_punctuation, combine_numbers)
     elif lang == 'zh':
-        return chinese_tokenize(text, include_punctuation, external_wordlist)
+        return chinese_tokenize(text, include_punctuation, external_wordlist, combine_numbers)
     elif lang == 'tr':
-        return simple_tokenize(preprocess_turkish(text), include_punctuation)
+        return simple_tokenize(preprocess_turkish(text), include_punctuation, combine_numbers)
     elif lang == 'ro':
-        return simple_tokenize(preprocess_romanian(text), include_punctuation)
+        return simple_tokenize(preprocess_romanian(text), include_punctuation, combine_numbers)
     elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
         # These are the three language codes that could include Serbian text,
         # which could be in Cyrillic.
-        return simple_tokenize(preprocess_serbian(text), include_punctuation)
+        return simple_tokenize(preprocess_serbian(text), include_punctuation, combine_numbers)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
-        return simple_tokenize(text, include_punctuation)
+        return simple_tokenize(text, include_punctuation, combine_numbers)
     else:
-        return simple_tokenize(text, include_punctuation)
+        return simple_tokenize(text, include_punctuation, combine_numbers)
