mirror of https://github.com/rspeer/wordfreq.git

Look for MeCab dictionaries in various places besides this package

Former-commit-id: afe6537994
parent: ce5a91d732
commit: 94712c8312

README.md (44 lines changed)
@@ -1,4 +1,5 @@
-Tools for working with word frequencies from various corpora.
+wordfreq is a Python library for looking up the frequencies of words in many
+languages, based on many sources of data.
 
 Author: Robyn Speer
 
@@ -15,31 +16,48 @@ or by getting the repository and running its setup.py:
 
     python3 setup.py install
 
-Japanese and Chinese have additional external dependencies so that they can be
+### Additional CJK setup
+
+Chinese, Japanese, and Korean have additional external dependencies so that they can be
 tokenized correctly.
 
-To be able to look up word frequencies in Japanese, you need to additionally
-install mecab-python3, which itself depends on libmecab-dev and its dictionary.
-These commands will install them on Ubuntu:
-
-    sudo apt-get install mecab-ipadic-utf8 libmecab-dev
-    pip3 install mecab-python3
-
 To be able to look up word frequencies in Chinese, you need Jieba, a
 pure-Python Chinese tokenizer:
 
     pip3 install jieba
 
-These dependencies can also be requested as options when installing wordfreq.
-For example:
+To be able to look up word frequencies in Japanese or Korean, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev.
+These commands will install them on Ubuntu:
 
-    pip3 install wordfreq[mecab,jieba]
+    sudo apt-get install libmecab-dev
+    pip3 install mecab-python3
+
+If you installed wordfreq from Git, this should be all you need, because the
+dictionary files are included. Otherwise, read on.
+
+### Getting dictionary files for the PyPI version
+
+If you installed wordfreq from PyPI (for example, using pip), and you want to
+handle Japanese and Korean, you need to get their MeCab dictionary files
+separately. We would prefer to include them in the package, but PyPI has a size
+limit.
+
+The Japanese dictionary is called 'mecab-ipadic-utf8', and is available as an Ubuntu
+package by that name:
+
+    sudo apt-get install mecab-ipadic-utf8
+
+The Korean dictionary does not have an Ubuntu package. One option, besides getting it
+from wordfreq's Git repository, is to install it from source from:
+
+    https://bitbucket.org/eunjeon/mecab-ko-dic
 
 ## Usage
 
 wordfreq provides access to estimates of the frequency with which a word is
-used, in 18 languages (see *Supported languages* below).
+used, in 27 languages (see *Supported languages* below).
 
 It provides three kinds of pre-built wordlists:
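
As a sketch of the lookup this README section describes (assuming wordfreq and
the dependencies above are installed; the words queried here are arbitrary
examples, not part of the diff):

    from wordfreq import word_frequency

    # Frequency of a word, as a proportion of the wordlist it was built from.
    print(word_frequency('café', 'fr'))

    # With MeCab set up as described above, Japanese works the same way.
    print(word_frequency('こんにちは', 'ja'))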

@@ -1,12 +1,56 @@
 from pkg_resources import resource_filename
 import MeCab
 import unicodedata
+import os
+
+
+def find_mecab_dictionary(names):
+    """
+    Find a MeCab dictionary with a given name. The dictionary might come as
+    part of this repository (if you got wordfreq from GitHub) or might have to
+    be installed separately (if you got wordfreq from PyPI).
+
+    We'd prefer to include MeCab in the repository all the time, but PyPI's
+    package size limits make that not an option.
+    """
+    suggested_pkg = names[0]
+    paths = [
+        resource_filename('wordfreq', 'data'),
+        os.path.expanduser('~/.local/lib/mecab/dic'),
+        '/var/lib/mecab/dic',
+        '/var/local/lib/mecab/dic',
+        '/usr/lib/mecab/dic',
+        '/usr/local/lib/mecab/dic',
+    ]
+    full_paths = [os.path.join(path, name) for path in paths for name in names]
+    for path in full_paths:
+        if os.path.exists(path):
+            return path
+
+    error_lines = [
+        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
+        "You should download or use your system's package manager to install",
+        "the %r package." % suggested_pkg,
+        "",
+        "We looked in the following locations:"
+    ] + ["\t%s" % path for path in full_paths]
+
+    raise OSError('\n'.join(error_lines))
+
+
+def make_mecab_analyzer(names):
+    """
+    Get a MeCab analyzer object, given a list of names the dictionary might
+    have.
+    """
+    filename = find_mecab_dictionary(names)
+    return MeCab.Tagger('-d %s' % filename)
+
+
 # Instantiate the MeCab analyzers for each language.
 MECAB_ANALYZERS = {
-    'ja': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ja-ipadic')),
-    'ko': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ko-dic'))
+    'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'mecab-ja-ipadic', 'ipadic-utf8']),
+    'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic'])
 }
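
A sketch of how these analyzers could be used, assuming mecab-python3 and a
Japanese dictionary are installed; `MECAB_ANALYZERS` comes from the diff above,
and the tokenizing loop is illustrative, not part of this commit:

    # MeCab's Tagger.parse returns one line per token: the surface form,
    # a tab, then part-of-speech features, ending with an 'EOS' line.
    tagger = MECAB_ANALYZERS['ja']
    analyzed = tagger.parse('これはテストです')
    tokens = [line.split('\t')[0]
              for line in analyzed.split('\n')
              if line and line != 'EOS']
    print(tokens)   # something like: ['これ', 'は', 'テスト', 'です']

    # If no dictionary can be found in any of the searched locations,
    # find_mecab_dictionary raises OSError listing every path it tried.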