Merge branch 'master' into data-update-2.5

Commit 00e60df106 by Robyn Speer, 2021-03-29 16:42:24 -04:00
4 changed files with 40 additions and 114 deletions

diff --git a/CHANGELOG.md b/CHANGELOG.md

@@ -1,3 +1,14 @@
+## Version 2.4.2 (2021-02-19)
+
+- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
+  be installed separately as system packages. They can now be found via the
+  Python packages `ipadic` and `mecab-ko-dic`.
+
+- When the tokenizer had to infer word boundaries in languages without spaces,
+  inputs that were too long (such as the letter 'l' repeated 800 times) were
+  causing overflow errors. We changed the sequence of operations so that it
+  no longer overflows, and such inputs simply get a frequency of 0.
+
 ## Version 2.4.1 (2021-02-09)
 
 - Changed a log message to not try to call a language by name, to remove
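A quick way to see the overflow fix in action (a sketch, assuming wordfreq 2.4.2 or later with the CJK extras installed):

```python
from wordfreq import word_frequency

# A pathological input in a language written without spaces: inferring
# word boundaries on this used to cause an overflow error. Since 2.4.2
# it should simply report a frequency of 0 instead.
print(word_frequency('l' * 800, 'ja'))   # expected: 0.0
```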

diff --git a/README.md b/README.md

@@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example.
 
 ## Additional CJK installation
 
 Chinese, Japanese, and Korean have additional external dependencies so that
-they can be tokenized correctly. Here we'll explain how to set them up,
-in increasing order of difficulty.
-
-### Chinese
-
-To be able to look up word frequencies in Chinese, you need Jieba, a
-pure-Python Chinese tokenizer:
-
-    pip3 install jieba
-
-### Japanese
-
-We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three
-things need to be installed:
-
-  * The MeCab development library (called `libmecab-dev` on Ubuntu)
-  * The UTF-8 version of the `ipadic` Japanese dictionary
-    (called `mecab-ipadic-utf8` on Ubuntu)
-  * The `mecab-python3` Python interface
-
-To install these three things on Ubuntu, you can run:
-
-```sh
-sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8
-pip3 install mecab-python3
-```
-
-If you choose to install `ipadic` from somewhere else or from its source code,
-be sure it's configured to use UTF-8. By default it will use EUC-JP, which will
-give you nonsense results.
-
-### Korean
-
-Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and
-Yungho Yu. This dictionary is not available as an Ubuntu package.
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+they can be tokenized correctly. They can all be installed at once by requesting
+the 'cjk' feature:
+
+    pip install wordfreq[cjk]
+
+Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends
+on `mecab-python3` and `ipadic`, and tokenizing Korean depends on `mecab-python3`
+and `mecab-ko-dic`.
+
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 ## License
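For illustration, a sketch of what the simplified setup enables, assuming `pip install wordfreq[cjk]` has completed; the example words are arbitrary and the exact frequencies depend on the wordlist data:

```python
from wordfreq import word_frequency

# jieba handles Chinese; MeCab with ipadic handles Japanese; MeCab with
# mecab-ko-dic handles Korean. No system packages are involved anymore.
print(word_frequency('谢谢', 'zh'))        # Chinese
print(word_frequency('ありがとう', 'ja'))    # Japanese
print(word_frequency('감사합니다', 'ko'))    # Korean
```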

diff --git a/setup.py b/setup.py

@@ -28,7 +28,7 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
@@ -49,14 +49,17 @@ setup(
     install_requires=dependencies,
 
     # mecab-python3 is required for looking up Japanese or Korean word
-    # frequencies. In turn, it depends on libmecab-dev being installed on the
-    # system. It's not listed under 'install_requires' because wordfreq should
-    # be usable in other languages without it.
+    # frequencies. It's not listed under 'install_requires' because wordfreq
+    # should be usable in other languages without it.
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        # previous names for extras
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
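To check that the new extras resolve as intended, the packages they list can be imported directly. A minimal sketch; the module names are the ones the diff itself imports:

```python
# After `pip install wordfreq[cjk]`, all four packages should be importable.
import jieba          # Chinese tokenizer
import MeCab          # the mecab-python3 binding
import ipadic         # Japanese dictionary; exposes MECAB_ARGS
import mecab_ko_dic   # Korean dictionary; exposes MECAB_ARGS

# These argument strings point MeCab at the pip-installed dictionaries,
# replacing the old search of system paths.
print(ipadic.MECAB_ARGS)
print(mecab_ko_dic.MECAB_ARGS)
```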

diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py

@@ -8,56 +8,21 @@ import os
 
 MAX_PATH_LENGTH = 58
 
-def find_mecab_dictionary(names):
-    """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
-    """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
-
-
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
+def make_mecab_analyzer(lang):
+    """
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
+    """
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
 
 
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
-    if lang not in MECAB_DICTIONARY_NAMES:
-        raise ValueError("Can't run MeCab on language %r" % lang)
     if lang not in MECAB_ANALYZERS:
-        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
     text = unicodedata.normalize('NFKC', text.strip())
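The new code path is small enough to trace by hand. Here is a sketch of what `make_mecab_analyzer('ja')` builds and how `mecab_tokenize` uses it, relying only on calls that appear in this diff plus MeCab's standard `parse()` method:

```python
import MeCab
import ipadic

# Equivalent to make_mecab_analyzer('ja'): a Tagger pointed at the
# pip-installed IPADIC dictionary via its published argument string.
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

# parse() returns the same table the command-line mecab would print:
# one token per line as "surface\tfeatures", terminated by "EOS".
# mecab_tokenize reads the tokens from the first column.
analysis = tagger.parse('食べました')
tokens = [line.split('\t')[0]
          for line in analysis.split('\n')
          if line and line != 'EOS']
print(tokens)
```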