Allow MeCab to work in Japanese or Korean without requiring the other language's dictionary to be installed

This commit is contained in:
Robyn Speer 2016-08-19 11:41:35 -04:00
parent e4b32afa18
commit e1d6e7d96f

View File

@@ -52,11 +52,13 @@ def make_mecab_analyzer(names):
return MeCab.Tagger('-d %s' % find_mecab_dictionary(names)) return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
# Instantiate the MeCab analyzers for each language. # Describe how to get the MeCab analyzers for each language.
MECAB_ANALYZERS = { MECAB_DICTIONARY_NAMES = {
'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'ipadic-utf8']), 'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic']) 'ko': ['mecab-ko-dic', 'ko-dic']
} }
# The constructed analyzers will go in this dictionary.
MECAB_ANALYZERS = {}
def mecab_tokenize(text, lang): def mecab_tokenize(text, lang):
@@ -68,8 +70,11 @@ def mecab_tokenize(text, lang):
contains the same table that the command-line version of MeCab would output. contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table. We find the tokens in the first column of this table.
""" """
if lang not in MECAB_ANALYZERS: if lang not in MECAB_DICTIONARY_NAMES:
raise ValueError("Can't run MeCab on language %r" % lang) raise ValueError("Can't run MeCab on language %r" % lang)
if lang not in MECAB_ANALYZERS:
MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
analyzer = MECAB_ANALYZERS[lang] analyzer = MECAB_ANALYZERS[lang]
text = unicodedata.normalize('NFKC', text.strip()) text = unicodedata.normalize('NFKC', text.strip())
analyzed = analyzer.parse(text) analyzed = analyzer.parse(text)