Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
Look for MeCab dictionaries in various places besides this package
Former-commit-id: afe6537994
This commit is contained in:
parent ce5a91d732
commit 94712c8312
README.md
@@ -1,4 +1,5 @@
-Tools for working with word frequencies from various corpora.
+wordfreq is a Python library for looking up the frequencies of words in many
+languages, based on many sources of data.
 
 Author: Robyn Speer
 
@@ -15,31 +16,48 @@ or by getting the repository and running its setup.py:
 
     python3 setup.py install
 
-Japanese and Chinese have additional external dependencies so that they can be
+### Additional CJK setup
+
+Chinese, Japanese, and Korean have additional external dependencies so that they can be
 tokenized correctly.
 
-To be able to look up word frequencies in Japanese, you need to additionally
-install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+To be able to look up word frequencies in Japanese or Korean, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev.
 These commands will install them on Ubuntu:
 
-    sudo apt-get install mecab-ipadic-utf8 libmecab-dev
+    sudo apt-get install libmecab-dev
     pip3 install mecab-python3
 
 To be able to look up word frequencies in Chinese, you need Jieba, a
 pure-Python Chinese tokenizer:
 
     pip3 install jieba
 
 These dependencies can also be requested as options when installing wordfreq.
 For example:
 
     pip3 install wordfreq[mecab,jieba]
 
+If you installed wordfreq from Git, this should be all you need, because the
+dictionary files are included. Otherwise, read on.
+
+### Getting dictionary files for the PyPI version
+
+If you installed wordfreq from PyPI (for example, using pip), and you want to
+handle Japanese and Korean, you need to get their MeCab dictionary files
+separately. We would prefer to include them in the package, but PyPI has a size
+limit.
+
+The Japanese dictionary is called 'mecab-ipadic-utf8', and is available as an Ubuntu
+package by that name:
+
+    sudo apt-get install mecab-ipadic-utf8
+
+The Korean dictionary does not have an Ubuntu package. One option, besides getting it
+from wordfreq's Git repository, is to install it from source from:
+
+    https://bitbucket.org/eunjeon/mecab-ko-dic
 
 ## Usage
 
 wordfreq provides access to estimates of the frequency with which a word is
-used, in 18 languages (see *Supported languages* below).
+used, in 27 languages (see *Supported languages* below).
 
 It provides three kinds of pre-built wordlists:
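
As a quick check that a dictionary installed as described above is usable,
mecab-python3 can load it directly. This is a minimal sketch: the path below is
where a default source build of mecab-ko-dic typically lands (it is also one of
the directories the new lookup code in this commit searches), not a location
the README itself specifies.

    import MeCab

    # '-d' points MeCab at a dictionary directory; adjust the path to
    # wherever your dictionary was actually installed.
    tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ko-dic')
    print(tagger.parse('안녕하세요'))  # one analyzed morpheme per line, then EOS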
@@ -1,12 +1,56 @@
 from pkg_resources import resource_filename
 import MeCab
 import unicodedata
+import os
+
+
+def find_mecab_dictionary(names):
+    """
+    Find a MeCab dictionary with a given name. The dictionary might come as
+    part of this repository (if you got wordfreq from GitHub) or might have to
+    be installed separately (if you got wordfreq from PyPI).
+
+    We'd prefer to include MeCab in the repository all the time, but PyPI's
+    package size limits make that not an option.
+    """
+    suggested_pkg = names[0]
+    paths = [
+        resource_filename('wordfreq', 'data'),
+        os.path.expanduser('~/.local/lib/mecab/dic'),
+        '/var/lib/mecab/dic',
+        '/var/local/lib/mecab/dic',
+        '/usr/lib/mecab/dic',
+        '/usr/local/lib/mecab/dic',
+    ]
+    full_paths = [os.path.join(path, name) for path in paths for name in names]
+    for path in full_paths:
+        if os.path.exists(path):
+            return path
+
+    # The comprehension's 'name' variable is not in scope here, so the error
+    # message names the suggested package instead.
+    error_lines = [
+        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
+        "You should download or use your system's package manager to install",
+        "the %r package." % suggested_pkg,
+        "",
+        "We looked in the following locations:"
+    ] + ["\t%s" % path for path in full_paths]
+
+    raise OSError('\n'.join(error_lines))
+
+
+def make_mecab_analyzer(names):
+    """
+    Get a MeCab analyzer object, given a list of names the dictionary might
+    have.
+    """
+    filename = find_mecab_dictionary(names)
+    return MeCab.Tagger('-d %s' % filename)
 
 
 # Instantiate the MeCab analyzers for each language.
 MECAB_ANALYZERS = {
-    'ja': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ja-ipadic')),
-    'ko': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ko-dic'))
+    'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'mecab-ja-ipadic', 'ipadic-utf8']),
+    'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
 }
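
For illustration, here is how these analyzers can be used once constructed.
The sketch relies only on the mecab-python3 API, in which Tagger.parse()
returns one "surface<TAB>features" line per morpheme followed by an "EOS"
line; the helper name is hypothetical:

    # Hypothetical helper: tokenize Japanese text with the 'ja' analyzer.
    def mecab_tokenize_sketch(text):
        analyzed = MECAB_ANALYZERS['ja'].parse(text)
        # Keep the surface form (the text before the tab) on each line,
        # skipping the terminating 'EOS' marker and any empty lines.
        return [line.split('\t')[0]
                for line in analyzed.split('\n')
                if line and line != 'EOS']

    print(mecab_tokenize_sketch('私はPythonが好きです'))
    # e.g. ['私', 'は', 'Python', 'が', '好き', 'です']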
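
End to end, the lookup that the README's Usage section describes comes down to
the word_frequency function. A minimal sketch (the example word and the rough
magnitude are ours, not the README's):

    from wordfreq import word_frequency

    # word_frequency(word, lang) returns the word's frequency as a
    # proportion of the corpus, between 0 and 1.
    print(word_frequency('the', 'en'))  # a very common word: roughly 0.05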