From abd0820a325dbf543dd14076ff0b6d3fab478dbb Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Wed, 11 Jan 2017 19:04:19 -0500
Subject: [PATCH] Handle smashing numbers only at the end of tokenize().

This does make the code a lot clearer.
---
 wordfreq/tokens.py | 59 +++++++++++++---------------------------------
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index eee1762..de4b566 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -101,7 +101,7 @@ DIGIT_RE = regex.compile('\d')
 MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
 
 
-def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -121,11 +121,6 @@ def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
       such as emoji. If `include_punctuation` is True, it outputs all non-space
       tokens.
 
-    - If `combine_numbers` is True, then multi-digit numbers will be replaced
-      by strings of zeroes. When looking up word frequencies, this allows all
-      numbers of the same length to be treated as the same "word", avoiding
-      unnecessarily sparse data.
-
     - It breaks on all spaces, even the "non-breaking" ones.
 
     - It aims to keep marks together with words, so that they aren't erroneously
@@ -136,23 +131,18 @@ def simple_tokenize(text, include_punctuation=False, combine_numbers=False):
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
     if include_punctuation:
         return [
-            postprocess(token.casefold())
+            token.casefold()
             for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
         ]
     else:
         return [
-            postprocess(token.strip("'").casefold())
+            token.strip("'").casefold()
             for token in TOKEN_RE.findall(text)
         ]
 
 
-def tokenize_mecab_language(text, lang, include_punctuation=False,
-                            combine_numbers=False):
+def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if
     necessary.
     """
@@ -161,32 +151,21 @@ def tokenize_mecab_language(text, lang, include_punctuation=False,
         raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
     if mecab_tokenize is None:
         from wordfreq.mecab import mecab_tokenize
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [postprocess(token.casefold()) for token in tokens
-            if token_expr.match(token)]
+    return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False,
-                     combine_numbers=False):
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     """
     Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
     """
     global jieba_tokenize
     if jieba_tokenize is None:
         from wordfreq.chinese import jieba_tokenize
-    if combine_numbers:
-        postprocess = smash_numbers
-    else:
-        postprocess = _identity
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [postprocess(token.casefold()) for token in tokens
-            if token_expr.match(token)]
+    return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
 def remove_marks(text):
@@ -274,13 +253,6 @@ def smash_numbers(text):
     return MULTI_DIGIT_RE.sub(sub_zeroes, text)
 
 
-def _identity(text):
-    """
-    The identity function, as an alternative to smashing numbers.
-    """
-    return text
-
-
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
              combine_numbers=False):
     """
@@ -393,20 +365,23 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     # language
     lang = lang.split('-')[0]
     if lang == 'ja' or lang == 'ko':
-        return tokenize_mecab_language(text, lang, include_punctuation, combine_numbers)
+        result = tokenize_mecab_language(text, lang, include_punctuation)
     elif lang == 'zh':
-        return chinese_tokenize(text, include_punctuation, external_wordlist, combine_numbers)
+        result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
-        return simple_tokenize(preprocess_turkish(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
-        return simple_tokenize(preprocess_romanian(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
     elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
         # These are the three language codes that could include Serbian text,
         # which could be in Cyrillic.
-        return simple_tokenize(preprocess_serbian(text), include_punctuation, combine_numbers)
+        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
-        return simple_tokenize(text, include_punctuation, combine_numbers)
+        result = simple_tokenize(text, include_punctuation)
     else:
-        return simple_tokenize(text, include_punctuation, combine_numbers)
+        result = simple_tokenize(text, include_punctuation)
+    if combine_numbers:
+        result = [smash_numbers(token) for token in result]
+    return result