Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00

refactor the tokenizer, add include_punctuation option

Former-commit-id: e8e6e0a231
parent 1adbb1aaf1
commit 9a007b9948
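In short, `tokenize` and the lower-level tokenizers gain an `include_punctuation` keyword argument (default False). A minimal usage sketch, assuming `tokenize` is imported from the `wordfreq` package as in the tests below; the expected outputs are taken from the test changes in this diff:

    from wordfreq import tokenize

    # Default behaviour is unchanged: punctuation-only tokens are dropped.
    tokenize("I don't split at apostrophes, you see.", 'en')
    # ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

    # With the new flag, every non-space token is kept, including ',' and '.'.
    tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
    # ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']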
@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

-            # Make up a weirdly verbose language code and make sure
-            # we still get it
-            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),
@@ -108,6 +113,9 @@ def test_tokenization():

     eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
@@ -39,10 +39,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -57,22 +63,44 @@ def simple_tokenize(text):
     ideograms and hiragana) relatively untokenized, instead of splitting each
     character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_arabic_marks(text):
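To make the new docstring wording concrete: a small sketch, not part of the commit, of how the refactored helpers behave. The import path `wordfreq.tokens` is an assumption (the hunks do not name the file); the Turkish result follows from the `İ`/`I` replacement before casefolding, and the punctuation results mirror the English tests above.

    # Assumed module path; adjust to wherever simple_tokenize/turkish_tokenize live.
    from wordfreq.tokens import simple_tokenize, turkish_tokenize

    # Plain casefolding turns 'I' into 'i'; the Turkish helper first maps
    # 'İ' -> 'i' and 'I' -> 'ı', as in the code above.
    simple_tokenize('KIRMIZI')     # ['kirmizi']
    turkish_tokenize('KIRMIZI')    # ['kırmızı']

    # The flag switches between TOKEN_RE and TOKEN_RE_WITH_PUNCTUATION, so
    # punctuation runs and emoticons are kept instead of filtered out.
    simple_tokenize('this text has... punctuation :)')
    # expected: ['this', 'text', 'has', 'punctuation']
    simple_tokenize('this text has... punctuation :)', include_punctuation=True)
    # expected: ['this', 'text', 'has', '...', 'punctuation', ':)']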
@@ -89,7 +117,7 @@ def remove_arabic_marks(text):

 mecab_tokenize = None
 jieba_tokenize = None
-def tokenize(text, lang, external_wordlist=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.
@@ -124,24 +152,14 @@ def tokenize(text, lang, external_wordlist=False):
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)
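The tokenize() body is now a thin dispatcher over one helper per language, with the Japanese and Chinese helpers importing their optional MeCab/jieba dependencies lazily. A hedged sketch of how the dispatch should behave after this commit; the Arabic expectation is derived from ARABIC_MARK_RE rather than from a test in this diff.

    from wordfreq import tokenize

    # 'ar' is NFKC-normalized and stripped of nonspacing marks and tatweel
    # before falling through to simple_tokenize, so an elongated spelling
    # should collapse to the plain word.
    tokenize('كـــتاب', 'ar')      # expected: ['كتاب']

    # Any other language code goes straight to simple_tokenize, with the same
    # include_punctuation keyword available (output from the test context above).
    tokenize('flip-flop', 'en')    # ['flip', 'flop']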