Merge pull request #89 from LuminosoInsight/dependencies-and-tokens

Rework CJK dependencies and fix a tokenization bug
This commit is contained in:
Lance Nathan 2021-02-23 15:15:17 -05:00 committed by GitHub
commit 4c0b29f460
6 changed files with 51 additions and 114 deletions

View File

@@ -1,3 +1,14 @@
## Version 2.4.2 (2021-02-19)
- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
be installed separately as system packages. They can now be found via the
Python packages `ipadic` and `mecab-ko-dic`.
- When the tokenizer had to infer word boundaries in languages without spaces,
inputs that were too long (such as the letter 'l' repeated 800 times) were
causing overflow errors. We changed the sequence of operations so that it
no longer overflows, and such inputs simply get a frequency of 0.
## Version 2.4.1 (2021-02-09)
- Changed a log message to not try to call a language by name, to remove

View File

@@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example.
## Additional CJK installation
Chinese, Japanese, and Korean have additional external dependencies so that
they can be tokenized correctly. Here we'll explain how to set them up,
in increasing order of difficulty.
they can be tokenized correctly. They can all be installed at once by requesting
the 'cjk' feature:
pip install wordfreq[cjk]
### Chinese
Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends
on `mecab-python3` and `ipadic`, and tokenizing Korean depends on `mecab-python3`
and `mecab-ko-dic`.
To be able to look up word frequencies in Chinese, you need Jieba, a
pure-Python Chinese tokenizer:
pip3 install jieba
### Japanese
We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three
things need to be installed:
* The MeCab development library (called `libmecab-dev` on Ubuntu)
* The UTF-8 version of the `ipadic` Japanese dictionary
(called `mecab-ipadic-utf8` on Ubuntu)
* The `mecab-python3` Python interface
To install these three things on Ubuntu, you can run:
```sh
sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8
pip3 install mecab-python3
```
If you choose to install `ipadic` from somewhere else or from its source code,
be sure it's configured to use UTF-8. By default it will use EUC-JP, which will
give you nonsense results.
### Korean
Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and
Yungho Yu. This dictionary is not available as an Ubuntu package.
Here's a process you can use to install the Korean dictionary and the other
MeCab dependencies:
```sh
sudo apt-get install libmecab-dev mecab-utils
pip3 install mecab-python3
wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
cd mecab-ko-dic-2.0.1-20150920
./autogen.sh
./configure
make
sudo make install
```
If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
tokenize those languages, it will raise an error and show you the list of
paths it searched.
Sorry that this is difficult. We tried to just package the data files we need
with wordfreq, like we do for Chinese, but PyPI would reject the package for
being too large.
As of version 2.4.2, you no longer have to install dictionaries separately.
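For example, once the `cjk` extra is installed, the following should work with
no extra system packages at all (the outputs shown in comments are illustrative,
not exact):

```python
from wordfreq import tokenize, zipf_frequency

# Japanese and Korean go through MeCab with the bundled ipadic /
# mecab-ko-dic packages; Chinese goes through jieba.
print(tokenize('これはテストです', 'ja'))   # e.g. ['これ', 'は', 'テスト', 'です']
print(tokenize('감사합니다', 'ko'))          # segmented by MeCab + mecab-ko-dic
print(zipf_frequency('谢谢', 'zh'))          # a fairly high Zipf value for a common word
```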
## License

View File

@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
encoding='utf-8').read()
doclines = README_contents.split("\n")
dependencies = [
'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
]
setup(
name="wordfreq",
version='2.4.1',
version='2.4.2',
maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
#
# Similarly, jieba is required for Chinese word frequencies.
extras_require={
'mecab': 'mecab-python3',
'jieba': 'jieba >= 0.42'
# previous names for extras
'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
'jieba': ['jieba >= 0.42'],
# get them all at once
'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
},
tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
)

View File

@@ -1,4 +1,4 @@
from wordfreq import tokenize, word_frequency
from wordfreq import tokenize, word_frequency, zipf_frequency
import pytest
@@ -77,3 +77,13 @@ def test_alternate_codes():
# Separate codes for Mandarin and Cantonese
assert tokenize('谢谢谢谢', 'cmn') == tokens
assert tokenize('谢谢谢谢', 'yue') == tokens
def test_unreasonably_long():
# This crashed earlier versions of wordfreq due to an overflow in
# exponentiation. We've now changed the sequence of operations so it
# will underflow instead.
lots_of_ls = 'l' * 800
assert word_frequency(lots_of_ls, 'zh') == 0.
assert zipf_frequency(lots_of_ls, 'zh') == 0.

View File

@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
# If we used the Jieba tokenizer, we could tokenize anything to match
# our wordlist, even nonsense. To counteract this, we multiply by a
# probability for each word break that was inferred.
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
# All our frequency data is only precise to within 1% anyway, so round
# it to 3 significant digits
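The one-line change above is the whole fix for the overflow described in the
changelog and exercised by `test_unreasonably_long`. A minimal sketch of the
behavioural difference, using 10 as a stand-in for `INFERRED_SPACE_FACTOR` and
800 inferred word breaks (illustrative values, not wordfreq's exact code):

```python
freq, factor, breaks = 1e-9, 10, 800

# Old order of operations: dividing by the enormous power overflows.
try:
    freq / factor ** breaks
except OverflowError:
    print('overflow')

# New order: the negative exponent yields a float that quietly underflows
# to 0.0, so an unreasonably long input simply gets a frequency of 0.
print(freq * factor ** -breaks)   # 0.0
```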

View File

@@ -8,56 +8,21 @@ import os
MAX_PATH_LENGTH = 58
def find_mecab_dictionary(names):
def make_mecab_analyzer(lang):
"""
Find a MeCab dictionary with a given name. The dictionary has to be
installed separately -- see wordfreq's README for instructions.
Get a MeCab analyzer object, given the language code of the language to
analyze.
"""
suggested_pkg = names[0]
paths = [
os.path.expanduser('~/.local/lib/mecab/dic'),
'/var/lib/mecab/dic',
'/var/local/lib/mecab/dic',
'/usr/lib/mecab/dic',
'/usr/local/lib/mecab/dic',
'/usr/lib/x86_64-linux-gnu/mecab/dic',
]
full_paths = [os.path.join(path, name) for path in paths for name in names]
checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
for path in checked_paths:
if os.path.exists(path):
return path
error_lines = [
"Couldn't find the MeCab dictionary named %r." % suggested_pkg,
"You should download or use your system's package manager to install",
"the %r package." % suggested_pkg,
"",
"We looked in the following locations:"
] + ["\t%s" % path for path in checked_paths]
skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
if skipped_paths:
error_lines += [
"We had to skip these paths that are too long for MeCab to find:",
] + ["\t%s" % path for path in skipped_paths]
raise OSError('\n'.join(error_lines))
if lang == 'ko':
import mecab_ko_dic
return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
elif lang == 'ja':
import ipadic
return MeCab.Tagger(ipadic.MECAB_ARGS)
else:
raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
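For context, `ipadic.MECAB_ARGS` and `mecab_ko_dic.MECAB_ARGS` are argument
strings that point MeCab at the dictionary directory bundled inside each Python
package, which is what removes the need for system-wide dictionaries. A rough
usage sketch (the exact contents of `MECAB_ARGS` vary by package version and
install location):

```python
import MeCab
import ipadic

# MECAB_ARGS is essentially a '-d <path to the dictionary bundled in
# this package>' argument string.
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
print(tagger.parse('これはテストです'))  # one "surface<TAB>features" line per token, then EOS
```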
def make_mecab_analyzer(names):
"""
Get a MeCab analyzer object, given a list of names the dictionary might
have.
"""
return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
# Describe how to get the MeCab analyzers for each language.
MECAB_DICTIONARY_NAMES = {
'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
'ko': ['mecab-ko-dic', 'ko-dic']
}
# The constructed analyzers will go in this dictionary.
MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
if lang not in MECAB_DICTIONARY_NAMES:
raise ValueError("Can't run MeCab on language %r" % lang)
if lang not in MECAB_ANALYZERS:
MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
analyzer = MECAB_ANALYZERS[lang]
text = unicodedata.normalize('NFKC', text.strip())
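The remainder of `mecab_tokenize` (below the lines shown in this diff) reads
that table. As a rough sketch of the idea rather than wordfreq's exact code:
`Tagger.parse()` returns one `surface<TAB>features` line per token, terminated
by an `EOS` line, and the tokens are the first column:

```python
def first_column_tokens(analyzed):
    """Collect the surface forms (first tab-separated field) from MeCab output."""
    tokens = []
    for line in analyzed.split('\n'):
        if not line or line == 'EOS':
            break
        tokens.append(line.split('\t')[0])
    return tokens

# first_column_tokens(analyzer.parse('これはテストです'))
# would give something like ['これ', 'は', 'テスト', 'です']
```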