Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
add external_wordlist option to tokenize

commit 669bd16c13
parent 3cb3061e06
@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
         ]
     )
 
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
+
@@ -4,16 +4,34 @@ import jieba
 
 
 jieba_tokenizer = None
+jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 
 
 def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
-def jieba_tokenize(text):
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+def jieba_tokenize(text, external_wordlist=False):
+    """
+    If `external_wordlist` is False, this will tokenize the given text with our
+    custom Jieba dictionary, which contains only the strings that have
+    frequencies in wordfreq.
+
+    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
+    the purpose of looking up frequencies, it's ideal.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, so its results will be independent of the
+    data in wordfreq.
+    """
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
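For reference, a minimal usage sketch of the two branches in the new jieba_tokenize; the sentence is illustrative (not from the test suite) and the example assumes wordfreq is installed with both bundled dictionary files:

    from wordfreq.chinese import jieba_tokenize

    sentence = '他是美國的副總統'  # illustrative; contains Traditional 國 and 總統

    # Default branch: the text is mapped to Simplified Chinese and segmented
    # with wordfreq's custom dictionary, so the tokens are meant to be usable
    # for frequency lookups.
    print(jieba_tokenize(sentence))

    # external_wordlist branch: Jieba's own large dictionary is used and the
    # Traditional characters are left as they are; some tokens may have no
    # frequency in wordfreq.
    print(jieba_tokenize(sentence, external_wordlist=True))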
349046  wordfreq/data/jieba_zh_orig.txt (new file)
File diff suppressed because it is too large.
@@ -89,14 +89,16 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
 jieba_tokenize = None
 
-def tokenize(text, lang):
+def tokenize(text, lang, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.
 
     So far, this means:
 
-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the jieba tokenizer, on a custom word list of words that can be
+      looked up in wordfreq.
     - Japanese will be delegated to the external mecab-python module.
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
@@ -111,6 +113,13 @@ def tokenize(text, lang):
     as Arabic will be normalized more strongly and have combining marks and
     tatweels removed.
 
+    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
+    not be used for tokenization. Instead, it will use the large wordlist
+    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
+    characters as is. This will probably give more accurate tokenization, but
+    the resulting tokens won't necessarily have word frequencies that can be
+    looked up.
+
     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
     """
@@ -125,10 +134,9 @@ def tokenize(text, lang):
         global jieba_tokenize
         if jieba_tokenize is None:
             from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
+        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
         return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
 
-
     if lang == 'tr':
         return turkish_tokenize(text)
 
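A similar sketch at the level of the public tokenize function, tying the new option to the caveat above that the resulting tokens may not have look-up-able frequencies; it assumes tokenize and word_frequency are importable from the top-level wordfreq package and that its data is installed:

    from wordfreq import tokenize, word_frequency

    text = '他是美國的副總統'  # illustrative sentence with Traditional characters

    # Default wordlist: characters are simplified first, and each token should
    # have a nonzero frequency in wordfreq's Chinese data.
    for token in tokenize(text, 'zh'):
        print(token, word_frequency(token, 'zh'))

    # Jieba's packaged wordlist: possibly more natural segmentation, but some
    # tokens can come back with a frequency of 0 because they aren't entries
    # in wordfreq's own wordlist.
    for token in tokenize(text, 'zh', external_wordlist=True):
        print(token, word_frequency(token, 'zh'))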