refactor the tokenizer, add include_punctuation option

Rob Speer 2015-09-15 13:26:09 -04:00
parent 669bd16c13
commit e8e6e0a231
2 changed files with 61 additions and 35 deletions
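
The change threads a new `include_punctuation` flag through `tokenize()` and each per-language helper. A quick sketch of the resulting behavior, taking the expected outputs from the tests changed below:

from wordfreq import tokenize

# Default: punctuation-only tokens are dropped.
tokenize("I don't split at apostrophes, you see.", 'en')
# ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

# With the new flag, punctuation comes through as separate tokens.
tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
# ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']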


@@ -21,17 +21,19 @@ def test_languages():
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Laughter is the universal language
    # Laughter is the universal language. Look up either 'lol' or '笑' in each
    # language and make sure it has a non-zero frequency.
    for lang in avail:
        if lang not in {'zh', 'ja'}:
            # we do not have enough Chinese data
            # Japanese people do not lol
            assert_greater(word_frequency('lol', lang), 0)
        if lang in {'zh', 'ja'}:
            text = '笑'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang), 0)

            # Make up a weirdly verbose language code and make sure
            # we still get it
            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
            assert_greater(word_frequency('lol', new_lang_code), 0)
        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0)

def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
@@ -108,6 +113,9 @@ def test_tokenization():
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)'])

def test_casefolding():
    eq_(tokenize('WEISS', 'de'), ['weiss'])


@@ -39,10 +39,16 @@ TOKEN_RE = regex.compile(r"""
    (?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)

TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    [\p{IsIdeo}\p{Script=Hiragana}]+ |
    [\p{punct}]+ |
    \S(?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)

ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

def simple_tokenize(text):
def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.
@@ -57,22 +63,44 @@ def simple_tokenize(text):
      ideograms and hiragana) relatively untokenized, instead of splitting each
      character into its own token.

    - It outputs only the tokens that start with a word-like character, or
      miscellaneous symbols such as emoji.
    - If `include_punctuation` is False (the default), it outputs only the
      tokens that start with a word-like character, or miscellaneous symbols
      such as emoji. If `include_punctuation` is True, it outputs all non-space
      tokens.

    - It breaks on all spaces, even the "non-breaking" ones.
    """
    text = unicodedata.normalize('NFC', text)
    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.strip("'").casefold() for token in token_expr.findall(text)]
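
# A rough sketch of what the option changes, using the example string from the
# tests above: the include_punctuation=True result is the test's expected
# output, while the default result assumes TOKEN_RE's documented behavior of
# dropping tokens that start with punctuation.
#
#     simple_tokenize('this text has... punctuation :)')
#     # ['this', 'text', 'has', 'punctuation']   (assumed)
#     simple_tokenize('this text has... punctuation :)', include_punctuation=True)
#     # ['this', 'text', 'has', '...', 'punctuation', ':)']
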
def turkish_tokenize(text):
def turkish_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
    in Turkish.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.strip("'").casefold() for token in token_expr.findall(text)]

def japanese_tokenize(text, include_punctuation=False):
    global mecab_tokenize
    if mecab_tokenize is None:
        from wordfreq.japanese import mecab_tokenize
    tokens = mecab_tokenize(text)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]

def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    global jieba_tokenize
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize
    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_arabic_marks(text):
@@ -89,7 +117,7 @@ def remove_arabic_marks(text):
mecab_tokenize = None
jieba_tokenize = None
def tokenize(text, lang, external_wordlist=False):
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
"""
Tokenize this text in a way that's relatively simple but appropriate for
the language.
@@ -124,24 +152,14 @@ def tokenize(text, lang, external_wordlist=False):
    first, so that they can be expected to match the data.
    """
    if lang == 'ja':
        global mecab_tokenize
        if mecab_tokenize is None:
            from wordfreq.japanese import mecab_tokenize
        tokens = mecab_tokenize(text)
        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
    if lang == 'zh':
        global jieba_tokenize
        if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
    if lang == 'tr':
        return turkish_tokenize(text)
    if lang == 'ar':
        return japanese_tokenize(text, include_punctuation)
    elif lang == 'zh':
        return chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        return turkish_tokenize(text, include_punctuation)
    elif lang == 'ar':
        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
    return simple_tokenize(text)
        return simple_tokenize(text, include_punctuation)
    else:
        return simple_tokenize(text, include_punctuation)
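
The refactor moves the inline lazy imports above into the new japanese_tokenize and chinese_tokenize helpers, but the underlying pattern is unchanged: the MeCab or jieba wrapper is imported only on the first call and cached in a module-level global, so the optional dependency is never loaded for other languages. A minimal standalone sketch of that caching pattern, with a stdlib regex standing in for the heavy optional import:

_backend_tokenize = None  # module-level cache, like mecab_tokenize / jieba_tokenize

def lazy_tokenize(text):
    # Load the expensive backend only on the first call and cache it in the
    # module-level global; later calls reuse it, and code paths that never
    # need this backend never pay for the import.
    global _backend_tokenize
    if _backend_tokenize is None:
        from re import compile as _compile          # stand-in for a heavy optional import
        _backend_tokenize = _compile(r'\S+').findall
    return _backend_tokenize(text)

print(lazy_tokenize("lazy imports, cached once"))   # ['lazy', 'imports,', 'cached', 'once']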