From 94712c831208bce4f21945075db96994821ecb50 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Fri, 29 Jul 2016 17:27:15 -0400 Subject: [PATCH] Look for MeCab dictionaries in various places besides this package Former-commit-id: afe65379948fbf2fc4603c2bc9e620a3d55c1957 --- README.md | 44 ++++++++++++++++++++++++++++++------------- wordfreq/mecab.py | 48 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e473efa..c79a0f5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -Tools for working with word frequencies from various corpora. +wordfreq is a Python library for looking up the frequencies of words in many +languages, based on many sources of data. Author: Robyn Speer @@ -15,31 +16,48 @@ or by getting the repository and running its setup.py: python3 setup.py install -Japanese and Chinese have additional external dependencies so that they can be +### Additional CJK setup + +Chinese, Japanese, and Korean have additional external dependencies so that they can be tokenized correctly. -To be able to look up word frequencies in Japanese, you need to additionally -install mecab-python3, which itself depends on libmecab-dev and its dictionary. -These commands will install them on Ubuntu: - - sudo apt-get install mecab-ipadic-utf8 libmecab-dev - pip3 install mecab-python3 - To be able to look up word frequencies in Chinese, you need Jieba, a pure-Python Chinese tokenizer: pip3 install jieba -These dependencies can also be requested as options when installing wordfreq. -For example: +To be able to look up word frequencies in Japanese or Korean, you need to additionally +install mecab-python3, which itself depends on libmecab-dev. 
from pkg_resources import resource_filename
import MeCab
import unicodedata
import os


def find_mecab_dictionary(names):
    """
    Find a MeCab dictionary with a given name. The dictionary might come as
    part of this repository (if you got wordfreq from GitHub) or might have to
    be installed separately (if you got wordfreq from PyPI).

    We'd prefer to include MeCab in the repository all the time, but PyPI's
    package size limits make that not an option.

    `names` is a list of dictionary directory names to try, in order of
    preference; the first entry doubles as the package name we suggest
    installing when nothing is found.

    Returns the path of the first dictionary directory that exists, or
    raises OSError listing every location that was searched.
    """
    suggested_pkg = names[0]
    # Candidate parent directories: the data bundled with wordfreq itself,
    # then the usual per-user and system-wide MeCab dictionary locations.
    paths = [
        resource_filename('wordfreq', 'data'),
        os.path.expanduser('~/.local/lib/mecab/dic'),
        '/var/lib/mecab/dic',
        '/var/local/lib/mecab/dic',
        '/usr/lib/mecab/dic',
        '/usr/local/lib/mecab/dic',
    ]
    full_paths = [os.path.join(path, name) for path in paths for name in names]
    for path in full_paths:
        if os.path.exists(path):
            return path

    # BUG FIX: the original formatted the first message line with `name`,
    # which is only bound inside the comprehension above and is therefore
    # undefined here in Python 3 — a missing dictionary raised NameError
    # instead of the intended, informative OSError. Use the suggested
    # dictionary name (names[0]) instead.
    error_lines = [
        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
        "You should download or use your system's package manager to install",
        "the %r package." % suggested_pkg,
        "",
        "We looked in the following locations:"
    ] + ["\t%s" % path for path in full_paths]

    raise OSError('\n'.join(error_lines))


def make_mecab_analyzer(names):
    """
    Get a MeCab analyzer object, given a list of names the dictionary might
    have.
    """
    filename = find_mecab_dictionary(names)
    return MeCab.Tagger('-d %s' % filename)


# Instantiate the MeCab analyzers for each language.
MECAB_ANALYZERS = {
    'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'mecab-ja-ipadic', 'ipadic-utf8']),
    'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
}