From 94712c831208bce4f21945075db96994821ecb50 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Fri, 29 Jul 2016 17:27:15 -0400 Subject: [PATCH] Look for MeCab dictionaries in various places besides this package Former-commit-id: afe65379948fbf2fc4603c2bc9e620a3d55c1957 --- README.md | 44 ++++++++++++++++++++++++++++++------------- wordfreq/mecab.py | 48 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e473efa..c79a0f5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -Tools for working with word frequencies from various corpora. +wordfreq is a Python library for looking up the frequencies of words in many +languages, based on many sources of data. Author: Robyn Speer @@ -15,31 +16,48 @@ or by getting the repository and running its setup.py: python3 setup.py install -Japanese and Chinese have additional external dependencies so that they can be +### Additional CJK setup + +Chinese, Japanese, and Korean have additional external dependencies so that they can be tokenized correctly. -To be able to look up word frequencies in Japanese, you need to additionally -install mecab-python3, which itself depends on libmecab-dev and its dictionary. -These commands will install them on Ubuntu: - - sudo apt-get install mecab-ipadic-utf8 libmecab-dev - pip3 install mecab-python3 - To be able to look up word frequencies in Chinese, you need Jieba, a pure-Python Chinese tokenizer: pip3 install jieba -These dependencies can also be requested as options when installing wordfreq. -For example: +To be able to look up word frequencies in Japanese or Korean, you need to additionally +install mecab-python3, which itself depends on libmecab-dev. 
from pkg_resources import resource_filename
import MeCab
import unicodedata
import os


def find_mecab_dictionary(names):
    """
    Find a MeCab dictionary with a given name. The dictionary might come as
    part of this repository (if you got wordfreq from GitHub) or might have to
    be installed separately (if you got wordfreq from PyPI).

    We'd prefer to include MeCab in the repository all the time, but PyPI's
    package size limits make that not an option.

    `names` is a list of dictionary directory names to try, in order of
    preference; the first entry doubles as the package name we suggest
    installing when nothing is found.

    Returns the path of the first dictionary directory that exists, or
    raises OSError listing every location that was searched.
    """
    suggested_pkg = names[0]
    # Candidate parent directories: the data bundled with wordfreq itself,
    # then the usual per-user and system-wide MeCab dictionary locations.
    paths = [
        resource_filename('wordfreq', 'data'),
        os.path.expanduser('~/.local/lib/mecab/dic'),
        '/var/lib/mecab/dic',
        '/var/local/lib/mecab/dic',
        '/usr/lib/mecab/dic',
        '/usr/local/lib/mecab/dic',
    ]
    full_paths = [os.path.join(path, name) for path in paths for name in names]
    for path in full_paths:
        if os.path.exists(path):
            return path

    # BUG FIX: the original formatted the first message line with `name`,
    # which is only bound inside the comprehension above and is therefore
    # undefined here in Python 3 — a missing dictionary raised NameError
    # instead of the intended, informative OSError. Use the suggested
    # dictionary name (names[0]) instead.
    error_lines = [
        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
        "You should download or use your system's package manager to install",
        "the %r package." % suggested_pkg,
        "",
        "We looked in the following locations:"
    ] + ["\t%s" % path for path in full_paths]

    raise OSError('\n'.join(error_lines))


def make_mecab_analyzer(names):
    """
    Get a MeCab analyzer object, given a list of names the dictionary might
    have.
    """
    filename = find_mecab_dictionary(names)
    return MeCab.Tagger('-d %s' % filename)


# Instantiate the MeCab analyzers for each language.
MECAB_ANALYZERS = {
    'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'mecab-ja-ipadic', 'ipadic-utf8']),
    'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
}