add external_wordlist option to tokenize

parent 3cb3061e06
commit 669bd16c13
@@ -34,6 +34,27 @@ def test_tokens():
         ]
     )
 
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
+
     # You match the same tokens if you look it up in Traditional Chinese.
     eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
@@ -4,16 +4,34 @@ import jieba
 
 
 jieba_tokenizer = None
+jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 
 
 def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
-def jieba_tokenize(text):
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+def jieba_tokenize(text, external_wordlist=False):
+    """
+    If `external_wordlist` is False, this will tokenize the given text with our
+    custom Jieba dictionary, which contains only the strings that have
+    frequencies in wordfreq.
+
+    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
+    the purpose of looking up frequencies, it's ideal.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, so its results will be independent of the
+    data in wordfreq.
+    """
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
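Below is a minimal usage sketch of the two modes of `jieba_tokenize`. It is not part of the commit; it assumes wordfreq is installed with both dictionary data files in place, and the example sentence is reconstructed from the tokens in the test above (punctuation, if any, omitted). Each `jieba.Tokenizer` is built lazily on the first call and cached in the module-level globals.

    from wordfreq.chinese import jieba_tokenize

    sentence = '他是历史上第六位在任期内去世的美国副总统'

    # Default mode: wordfreq's custom dictionary; the text is mapped to
    # Simplified Chinese and casefolded first, and every token is a string
    # that has a frequency in wordfreq.
    print(jieba_tokenize(sentence))

    # External wordlist: Jieba's original dictionary; the text is segmented
    # as-is, so the result is independent of wordfreq's data.
    print(jieba_tokenize(sentence, external_wordlist=True))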
wordfreq/data/jieba_zh_orig.txt: new file, 349046 lines
(File diff suppressed because it is too large.)
@@ -89,14 +89,16 @@ def remove_arabic_marks(text):
 
 mecab_tokenize = None
 jieba_tokenize = None
-def tokenize(text, lang):
+def tokenize(text, lang, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.
 
     So far, this means:
 
-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the jieba tokenizer, on a custom word list of words that can be
+      looked up in wordfreq.
     - Japanese will be delegated to the external mecab-python module.
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
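As a small illustration of the new Chinese bullet above (a sketch, not part of the commit, assuming `tokenize` is exposed at the top level of the `wordfreq` package as the tests use it): Traditional and Simplified spellings of the same sentence are mapped to the same Simplified form before segmentation, which is what the test's `fact_simplified`/`fact_traditional` comparison asserts. The Simplified string below is reconstructed from the test's tokens; the Traditional form is written out by hand.

    from wordfreq import tokenize

    simplified = '他是历史上第六位在任期内去世的美国副总统'
    traditional = '他是歷史上第六位在任期內去世的美國副總統'

    # Both spellings are mapped to Simplified characters before being run
    # through jieba, so the default tokenization is identical.
    assert tokenize(simplified, 'zh') == tokenize(traditional, 'zh')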
@@ -111,6 +113,13 @@ def tokenize(text, lang):
     as Arabic will be normalized more strongly and have combining marks and
     tatweels removed.
 
+    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
+    not be used for tokenization. Instead, it will use the large wordlist
+    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
+    characters as is. This will probably give more accurate tokenization, but
+    the resulting tokens won't necessarily have word frequencies that can be
+    looked up.
+
     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
     """
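To make the caveat in that new paragraph concrete, here is a hedged sketch (again not part of the commit, and assuming `tokenize` and `word_frequency` are importable from the top-level `wordfreq` package): tokens produced with `external_wordlist=True` come from Jieba's own dictionary, so they are not guaranteed to be entries in wordfreq's Chinese wordlist.

    from wordfreq import tokenize, word_frequency

    sentence = '他是历史上第六位在任期内去世的美国副总统'

    # With Jieba's own wordlist, some segments (for example a longer span
    # like '第六位') may not match single entries in wordfreq, so the
    # frequencies printed here won't necessarily be meaningful lookups.
    for token in tokenize(sentence, 'zh', external_wordlist=True):
        print(token, word_frequency(token, 'zh'))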
@@ -125,10 +134,9 @@ def tokenize(text, lang):
         global jieba_tokenize
         if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
+        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
         return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
 
-
     if lang == 'tr':
         return turkish_tokenize(text)
 
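One design note on this last hunk: the `from wordfreq.chinese import jieba_tokenize` statement sits inside the function, guarded by the module-level `jieba_tokenize = None`, so jieba and its dictionary are only loaded the first time Chinese text is actually tokenized. A stripped-down sketch of that deferred-import pattern, illustrative only and using a hypothetical wrapper name:

    # Module-level placeholder, filled in on first use.
    jieba_tokenize = None

    def tokenize_chinese(text, external_wordlist=False):
        # Because of the `global` declaration, the `from ... import` below
        # rebinds the module-level name, so the import (and the dictionary
        # load it triggers) happens at most once per process.
        global jieba_tokenize
        if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
        return jieba_tokenize(text, external_wordlist=external_wordlist)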