add external_wordlist option to tokenize

Rob Speer 2015-09-10 18:09:41 -04:00
parent 3cb3061e06
commit 669bd16c13
4 changed files with 349108 additions and 15 deletions

View File

@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
         ]
     )
+
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )

View File

@@ -4,16 +4,34 @@ import jieba
 jieba_tokenizer = None
+jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')


 def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+def jieba_tokenize(text, external_wordlist=False):
+    """
+    If `external_wordlist` is False, this will tokenize the given text with our
+    custom Jieba dictionary, which contains only the strings that have
+    frequencies in wordfreq.
+
+    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
+    the purpose of looking up frequencies, it's ideal.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, so its results will be independent of the
+    data in wordfreq.
+    """
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
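
A quick usage sketch, not part of the commit: the two code paths of jieba_tokenize can be compared directly. The sample string is arbitrary, and the exact tokens printed depend on the dictionaries bundled with wordfreq and Jieba.

    # Illustrative sketch of the two modes of jieba_tokenize().
    from wordfreq.chinese import jieba_tokenize

    sample = '他在任期内去世'  # arbitrary sample sentence

    # Custom wordfreq dictionary: the text is simplified first and HMM-based
    # discovery of unknown words is disabled, so tokens are drawn from
    # wordfreq's own wordlist.
    print(jieba_tokenize(sample))

    # Jieba's original, much larger dictionary: the text is passed through
    # unchanged, so Traditional Chinese characters are left as is.
    print(jieba_tokenize(sample, external_wordlist=True))

Both Tokenizer instances are created lazily on first use and cached in module-level globals, so importing wordfreq.chinese stays cheap until Chinese text is actually tokenized.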

wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines added)

File diff suppressed because it is too large
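
The suppressed file is a dictionary in Jieba's plain-text format: one entry per line, giving a word followed by an optional frequency count and part-of-speech tag, separated by spaces. A small sketch of how one could peek at it once wordfreq is installed (illustrative, not part of the commit):

    # Print the first few entries of the bundled Jieba dictionary.
    from pkg_resources import resource_filename

    path = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            print(line.rstrip())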

View File

@@ -89,14 +89,16 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
 jieba_tokenize = None

-def tokenize(text, lang):
+def tokenize(text, lang, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.

     So far, this means:

-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the jieba tokenizer, on a custom word list of words that can be
+      looked up in wordfreq.
     - Japanese will be delegated to the external mecab-python module.
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
@@ -111,6 +113,13 @@ def tokenize(text, lang):
     as Arabic will be normalized more strongly and have combining marks and
     tatweels removed.

+    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
+    not be used for tokenization. Instead, it will use the large wordlist
+    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
+    characters as is. This will probably give more accurate tokenization, but
+    the resulting tokens won't necessarily have word frequencies that can be
+    looked up.
+
     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
     """
@@ -125,10 +134,9 @@ def tokenize(text, lang):
         global jieba_tokenize
         if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
+        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
         return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

     if lang == 'tr':
         return turkish_tokenize(text)
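
For context, a minimal sketch of how the new keyword argument is meant to be used through the public API, assuming tokenize and word_frequency are importable from the top-level wordfreq package as in released versions; the sentence is only an example, and the printed frequencies depend on the installed data:

    from wordfreq import tokenize, word_frequency

    sentence = '他在任期内去世的美国副总统'  # example text

    # Default: tokens come from wordfreq's own Jieba wordlist, so each one
    # can be looked up with word_frequency().
    for token in tokenize(sentence, 'zh'):
        print(token, word_frequency(token, 'zh'))

    # external_wordlist=True: Jieba's full dictionary, likely better
    # segmentation, but some tokens may have no frequency in wordfreq.
    for token in tokenize(sentence, 'zh', external_wordlist=True):
        print(token, word_frequency(token, 'zh'))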