From 2cc58d68ad07e4a963ae510bd3cc3832c0e2ae10 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 18:18:06 -0500
Subject: [PATCH] Use Python packages to find dictionaries for MeCab

---
Review note: in make_mecab_analyzer, the ValueError message uses the named
field {lang}, so str.format must receive the value as a keyword argument.
Passing it positionally raised KeyError('lang') instead of the intended
ValueError for unsupported languages. The post-image blob id on the
wordfreq/mecab.py "index" line predates this one-line fix.

 setup.py          | 10 ++++----
 wordfreq/mecab.py | 61 ++++++++++-----------------------------------------
 2 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/setup.py b/setup.py
index e2b1451..f68404a 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
     name="wordfreq",
-    version='2.4.1',
+    version='2.5.0',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,8 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index 8607387..fee555c 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
 
 
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
     """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
     """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
 
 
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
 
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
-    if lang not in MECAB_DICTIONARY_NAMES:
-        raise ValueError("Can't run MeCab on language %r" % lang)
     if lang not in MECAB_ANALYZERS:
-        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
     text = unicodedata.normalize('NFKC', text.strip())