Merge pull request #44 from LuminosoInsight/mecab-loading-fix

Allow MeCab to work in Japanese or Korean without the other
2024-12-23 09:21:37 +00:00 · 2016-08-19 11:59:44 -04:00 · 2016-08-19 11:59:44 -04:00 · 976c8df0fd
commit 976c8df0fd
parent e4b32afa18 aa880bcd84
2 changed files with 11 additions and 6 deletions
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):

 setup(
    name="wordfreq",
-    version='1.5',
+    version='1.5.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -52,11 +52,13 @@ def make_mecab_analyzer(names):
    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))


-# Instantiate the MeCab analyzers for each language.
-MECAB_ANALYZERS = {
-    'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'ipadic-utf8']),
-    'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
+# Describe how to get the MeCab analyzers for each language.
+MECAB_DICTIONARY_NAMES = {
+    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
+    'ko': ['mecab-ko-dic', 'ko-dic']
 }
+# The constructed analyzers will go in this dictionary.
+MECAB_ANALYZERS = {}


 def mecab_tokenize(text, lang):
@@ -68,8 +70,11 @@ def mecab_tokenize(text, lang):
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
-    if lang not in MECAB_ANALYZERS:
+    if lang not in MECAB_DICTIONARY_NAMES:
        raise ValueError("Can't run MeCab on language %r" % lang)
+    if lang not in MECAB_ANALYZERS:
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+
    analyzer = MECAB_ANALYZERS[lang]
    text = unicodedata.normalize('NFKC', text.strip())
    analyzed = analyzer.parse(text)