From 963e0ff7852c0216079e48daf75e1c5840efb810 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 15 Sep 2015 13:26:09 -0400
Subject: [PATCH] refactor the tokenizer, add `include_punctuation` option

Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
---
 tests/test.py      | 26 +++++++++++------
 wordfreq/tokens.py | 70 +++++++++++++++++++++++++++++-----------------
 2 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index dd26750..0013dcb 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)
 
-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)
 
-            # Make up a weirdly verbose language code and make sure
-            # we still get it
-            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert_greater(word_frequency(text, new_lang_code), 0)
 
 
 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
+
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),
@@ -108,6 +113,9 @@ def test_tokenization():
     eq_(tokenize("flip-flop", 'en'),
         ['flip', 'flop'])
 
+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index a2c308c..ad64bcd 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -39,10 +39,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)
 
+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
 
-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -57,22 +63,44 @@ def simple_tokenize(text):
       ideograms and hiragana) relatively untokenized, instead of splitting
       each character into its own token.
 
-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.
 
     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
 
 
-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
 def remove_arabic_marks(text):
@@ -89,7 +117,7 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
 jieba_tokenize = None
 
-def tokenize(text, lang, external_wordlist=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.
@@ -124,24 +152,14 @@ def tokenize(text, lang, external_wordlist=False):
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)