mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Use Python packages to find dictionaries for MeCab
This commit is contained in:
parent
6b97d093b6
commit
2cc58d68ad
10
setup.py
10
setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
|
||||
encoding='utf-8').read()
|
||||
doclines = README_contents.split("\n")
|
||||
dependencies = [
|
||||
'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
|
||||
'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
|
||||
]
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='2.4.1',
|
||||
version='2.5.0',
|
||||
maintainer='Robyn Speer',
|
||||
maintainer_email='rspeer@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
@@ -55,8 +55,8 @@ setup(
|
||||
#
|
||||
# Similarly, jieba is required for Chinese word frequencies.
|
||||
extras_require={
|
||||
'mecab': 'mecab-python3',
|
||||
'jieba': 'jieba >= 0.42'
|
||||
'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
|
||||
'jieba': ['jieba >= 0.42']
|
||||
},
|
||||
tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
|
||||
tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
|
||||
)
|
||||
|
@@ -8,56 +8,21 @@ import os
|
||||
MAX_PATH_LENGTH = 58
|
||||
|
||||
|
||||
def find_mecab_dictionary(names):
|
||||
def make_mecab_analyzer(lang):
    """
    Get a MeCab analyzer object, given the language code of the language to
    analyze.

    Supported languages are Korean ('ko', via the mecab-ko-dic package) and
    Japanese ('ja', via the ipadic package). Raises ValueError for any other
    language code.
    """
    if lang == 'ko':
        # Imported lazily so the dictionary package is only required when
        # the corresponding language is actually analyzed.
        import mecab_ko_dic
        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
    elif lang == 'ja':
        import ipadic
        return MeCab.Tagger(ipadic.MECAB_ARGS)
    else:
        # BUG FIX: the original wrote .format(lang), passing a positional
        # argument to the named field {lang}; that raises KeyError('lang')
        # instead of producing the intended ValueError message. Bind the
        # field by name so the language code appears in the error text.
        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))


# The constructed analyzers will go in this dictionary.
MECAB_ANALYZERS = {}
|
||||
|
||||
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
|
||||
contains the same table that the command-line version of MeCab would output.
|
||||
We find the tokens in the first column of this table.
|
||||
"""
|
||||
if lang not in MECAB_DICTIONARY_NAMES:
|
||||
raise ValueError("Can't run MeCab on language %r" % lang)
|
||||
if lang not in MECAB_ANALYZERS:
|
||||
MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
|
||||
MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
|
||||
|
||||
analyzer = MECAB_ANALYZERS[lang]
|
||||
text = unicodedata.normalize('NFKC', text.strip())
|
||||
|
Loading…
Reference in New Issue
Block a user