Use Python packages to find dictionaries for MeCab

This commit is contained in:
Robyn Speer 2021-02-18 18:18:06 -05:00
parent ed23bf3ebe
commit de636a804e
2 changed files with 17 additions and 54 deletions

View File

@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
encoding='utf-8').read() encoding='utf-8').read()
doclines = README_contents.split("\n") doclines = README_contents.split("\n")
dependencies = [ dependencies = [
'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04' 'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
] ]
setup( setup(
name="wordfreq", name="wordfreq",
version='2.4.1', version='2.5.0',
maintainer='Robyn Speer', maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com', maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',
@ -55,8 +55,8 @@ setup(
# #
# Similarly, jieba is required for Chinese word frequencies. # Similarly, jieba is required for Chinese word frequencies.
extras_require={ extras_require={
'mecab': 'mecab-python3', 'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
'jieba': 'jieba >= 0.42' 'jieba': ['jieba >= 0.42']
}, },
tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'], tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
) )

View File

@ -8,56 +8,21 @@ import os
MAX_PATH_LENGTH = 58 MAX_PATH_LENGTH = 58
def find_mecab_dictionary(names): def make_mecab_analyzer(lang):
""" """
Find a MeCab dictionary with a given name. The dictionary has to be Get a MeCab analyzer object, given the language code of the language to
installed separately -- see wordfreq's README for instructions. analyze.
""" """
suggested_pkg = names[0] if lang == 'ko':
paths = [ import mecab_ko_dic
os.path.expanduser('~/.local/lib/mecab/dic'), return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
'/var/lib/mecab/dic', elif lang == 'ja':
'/var/local/lib/mecab/dic', import ipadic
'/usr/lib/mecab/dic', return MeCab.Tagger(ipadic.MECAB_ARGS)
'/usr/local/lib/mecab/dic', else:
'/usr/lib/x86_64-linux-gnu/mecab/dic', raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
]
full_paths = [os.path.join(path, name) for path in paths for name in names]
checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
for path in checked_paths:
if os.path.exists(path):
return path
error_lines = [
"Couldn't find the MeCab dictionary named %r." % suggested_pkg,
"You should download or use your system's package manager to install",
"the %r package." % suggested_pkg,
"",
"We looked in the following locations:"
] + ["\t%s" % path for path in checked_paths]
skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
if skipped_paths:
error_lines += [
"We had to skip these paths that are too long for MeCab to find:",
] + ["\t%s" % path for path in skipped_paths]
raise OSError('\n'.join(error_lines))
def make_mecab_analyzer(names):
"""
Get a MeCab analyzer object, given a list of names the dictionary might
have.
"""
return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
# Describe how to get the MeCab analyzers for each language.
MECAB_DICTIONARY_NAMES = {
'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
'ko': ['mecab-ko-dic', 'ko-dic']
}
# The constructed analyzers will go in this dictionary. # The constructed analyzers will go in this dictionary.
MECAB_ANALYZERS = {} MECAB_ANALYZERS = {}
@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
contains the same table that the command-line version of MeCab would output. contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table. We find the tokens in the first column of this table.
""" """
if lang not in MECAB_DICTIONARY_NAMES:
raise ValueError("Can't run MeCab on language %r" % lang)
if lang not in MECAB_ANALYZERS: if lang not in MECAB_ANALYZERS:
MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang]) MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
analyzer = MECAB_ANALYZERS[lang] analyzer = MECAB_ANALYZERS[lang]
text = unicodedata.normalize('NFKC', text.strip()) text = unicodedata.normalize('NFKC', text.strip())