add external_wordlist option to tokenize

Former-commit-id: 669bd16c13
Robyn Speer 2015-09-10 18:09:41 -04:00
parent f2be213933
commit 1adbb1aaf1
4 changed files with 349108 additions and 15 deletions


@@ -34,6 +34,27 @@ def test_tokens():
        ]
    )

    # Jieba's original tokenizer knows a lot of names, it seems.
    eq_(
        tokenize(hobart, 'zh', external_wordlist=True),
        ['加勒特', '霍巴特']
    )

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    eq_(
        tokenize(fact_simplified, 'zh', external_wordlist=True),
        [
            # he / is / history / in / sixth person
            '他', '是', '历史', '上', '第六位',

            # during / term of office / in / die
            '在', '任期', '内', '去世',

            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # You match the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)


@@ -4,16 +4,34 @@ import jieba

jieba_tokenizer = None
jieba_orig_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')


def simplify_chinese(text):
    return text.translate(SIMPLIFIED_MAP).casefold()


def jieba_tokenize(text):
    global jieba_tokenizer
def jieba_tokenize(text, external_wordlist=False):
    """
    If `external_wordlist` is False, this will tokenize the given text with our
    custom Jieba dictionary, which contains only the strings that have
    frequencies in wordfreq.

    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
    the purpose of looking up frequencies, it's ideal.

    If `external_wordlist` is True, this will use the largest version of
    Jieba's original dictionary, so its results will be independent of the
    data in wordfreq.
    """
    global jieba_tokenizer, jieba_orig_tokenizer
    if external_wordlist:
        if jieba_orig_tokenizer is None:
            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
        return jieba_orig_tokenizer.lcut(text)
    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
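A minimal usage sketch of the two code paths added above (the sample sentence is a made-up example, not part of this commit; the exact output depends on the bundled dictionaries):

from wordfreq.chinese import jieba_tokenize

text = '他是美国副总统'  # hypothetical sample sentence

# Default path: the text is converted to Simplified Chinese and cut with the
# custom wordfreq dictionary, so each token can be looked up in wordfreq.
print(jieba_tokenize(text))

# External path: Jieba's own large dictionary is used and Traditional
# characters are left alone; some tokens may have no wordfreq entry.
print(jieba_tokenize(text, external_wordlist=True))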

wordfreq/data/jieba_zh_orig.txt (new file, 349046 additions)

File diff suppressed because it is too large


@@ -89,14 +89,16 @@ def remove_arabic_marks(text):

mecab_tokenize = None
jieba_tokenize = None


def tokenize(text, lang):
def tokenize(text, lang, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language.

    So far, this means:

    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
    - Chinese will be mapped to Simplified Chinese characters and tokenized
      using the jieba tokenizer, on a custom word list of words that can be
      looked up in wordfreq.
    - Japanese will be delegated to the external mecab-python module.
    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
@@ -111,6 +113,13 @@ def tokenize(text, lang):
    as Arabic will be normalized more strongly and have combining marks and
    tatweels removed.

    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
    not be used for tokenization. Instead, it will use the large wordlist
    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
    characters as is. This will probably give more accurate tokenization, but
    the resulting tokens won't necessarily have word frequencies that can be
    looked up.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
@@ -125,10 +134,9 @@ def tokenize(text, lang):
        global jieba_tokenize
        if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
        tokens = jieba_tokenize(text)
        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

    if lang == 'tr':
        return turkish_tokenize(text)
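To illustrate the trade-off described in the docstring, here is a hedged sketch of how a caller might compare the two modes (the sentence is a made-up example; word_frequency returns its default of 0 for tokens with no entry):

from wordfreq import tokenize, word_frequency

sentence = '他是美国副总统'  # hypothetical example text

# Default wordlist: tokens come from wordfreq's own dictionary, so each one
# should have a frequency that can be looked up.
for token in tokenize(sentence, 'zh'):
    print(token, word_frequency(token, 'zh'))

# External wordlist: possibly more accurate tokens, but some of them may not
# be in wordfreq's data and will report a frequency of 0.
for token in tokenize(sentence, 'zh', external_wordlist=True):
    print(token, word_frequency(token, 'zh'))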