Use Python packages to find dictionaries for MeCab

2024-12-23 09:21:37 +00:00 · 2021-02-18 18:18:06 -05:00 · 2021-02-18 18:18:06 -05:00 · de636a804e
commit de636a804e
parent ed23bf3ebe
2 changed files with 17 additions and 54 deletions
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 setup(
    name="wordfreq",
-    version='2.4.1',
+    version='2.5.0',
    maintainer='Robyn Speer',
    maintainer_email='rspeer@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,8 @@ setup(
    #
    # Similarly, jieba is required for Chinese word frequencies.
    extras_require={
-        'mecab': 'mecab-python3',
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
-        'jieba': 'jieba >= 0.42'
+        'jieba': ['jieba >= 0.42']
    },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
    """
-    Find a MeCab dictionary with a given name. The dictionary has to be
+    Get a MeCab analyzer object, given the language code of the language to
-    installed separately -- see wordfreq's README for instructions.
+    analyze.
    """
-    suggested_pkg = names[0]
+    if lang == 'ko':
-    paths = [
+        import mecab_ko_dic
-        os.path.expanduser('~/.local/lib/mecab/dic'),
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
-        '/var/lib/mecab/dic',
+    elif lang == 'ja':
-        '/var/local/lib/mecab/dic',
+        import ipadic
-        '/usr/lib/mecab/dic',
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
-        '/usr/local/lib/mecab/dic',
+    else:
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
+        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
    ]
    full_paths = [os.path.join(path, name) for path in paths for name in names]
    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
    for path in checked_paths:
        if os.path.exists(path):
            return path
    error_lines = [
        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
        "You should download or use your system's package manager to install",
        "the %r package." % suggested_pkg,
        "",
        "We looked in the following locations:"
    ] + ["\t%s" % path for path in checked_paths]
    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
    if skipped_paths:
        error_lines += [
            "We had to skip these paths that are too long for MeCab to find:",
        ] + ["\t%s" % path for path in skipped_paths]
    raise OSError('\n'.join(error_lines))
 def make_mecab_analyzer(names):
    """
    Get a MeCab analyzer object, given a list of names the dictionary might
    have.
    """
    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
 # Describe how to get the MeCab analyzers for each language.
 MECAB_DICTIONARY_NAMES = {
    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
    'ko': ['mecab-ko-dic', 'ko-dic']
 }
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
    if lang not in MECAB_DICTIONARY_NAMES:
        raise ValueError("Can't run MeCab on language %r" % lang)
    if lang not in MECAB_ANALYZERS:
-        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
    analyzer = MECAB_ANALYZERS[lang]
    text = unicodedata.normalize('NFKC', text.strip())