mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Allow MeCab to work in Japanese or Korean without requiring the other language's dictionary
This commit is contained in:
parent
e4b32afa18
commit
e1d6e7d96f
@ -52,11 +52,13 @@ def make_mecab_analyzer(names):
|
|||||||
return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
|
return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
|
||||||
|
|
||||||
|
|
||||||
# Instantiate the MeCab analyzers for each language.
|
# List the MeCab dictionary names to look for when building each language's analyzer.
|
||||||
MECAB_ANALYZERS = {
|
MECAB_DICTIONARY_NAMES = {
|
||||||
'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'ipadic-utf8']),
|
'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
|
||||||
'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
|
'ko': ['mecab-ko-dic', 'ko-dic']
|
||||||
}
|
}
|
||||||
|
# Analyzers are constructed on first use and cached in this dictionary, keyed by language code.
|
||||||
|
MECAB_ANALYZERS = {}
|
||||||
|
|
||||||
|
|
||||||
def mecab_tokenize(text, lang):
|
def mecab_tokenize(text, lang):
|
||||||
@ -68,8 +70,11 @@ def mecab_tokenize(text, lang):
|
|||||||
contains the same table that the command-line version of MeCab would output.
|
contains the same table that the command-line version of MeCab would output.
|
||||||
We find the tokens in the first column of this table.
|
We find the tokens in the first column of this table.
|
||||||
"""
|
"""
|
||||||
if lang not in MECAB_ANALYZERS:
|
if lang not in MECAB_DICTIONARY_NAMES:
|
||||||
raise ValueError("Can't run MeCab on language %r" % lang)
|
raise ValueError("Can't run MeCab on language %r" % lang)
|
||||||
|
if lang not in MECAB_ANALYZERS:
|
||||||
|
MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
|
||||||
|
|
||||||
analyzer = MECAB_ANALYZERS[lang]
|
analyzer = MECAB_ANALYZERS[lang]
|
||||||
text = unicodedata.normalize('NFKC', text.strip())
|
text = unicodedata.normalize('NFKC', text.strip())
|
||||||
analyzed = analyzer.parse(text)
|
analyzed = analyzer.parse(text)
|
||||||
|
Loading…
Reference in New Issue
Block a user