Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Merge pull request #89 from LuminosoInsight/dependencies-and-tokens
Rework CJK dependencies and fix a tokenization bug
Commit 32093d9efc

CHANGELOG.md (11 lines changed)
@@ -1,3 +1,14 @@
+## Version 2.4.2 (2021-02-19)
+
+- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
+  be installed separately as system packages. They can now be found via the
+  Python packages `ipadic` and `mecab-ko-dic`.
+
+- When the tokenizer had to infer word boundaries in languages without spaces,
+  inputs that were too long (such as the letter 'l' repeated 800 times) were
+  causing overflow errors. We changed the sequence of operations so that it
+  no longer overflows, and such inputs simply get a frequency of 0.
+
 ## Version 2.4.1 (2021-02-09)
 
 - Changed a log message to not try to call a language by name, to remove
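The overflow fix described in the second bullet is exercised by the new test further down in this diff. As a quick sketch of the user-visible behavior after the change (not code from the PR itself):

```python
from wordfreq import word_frequency, zipf_frequency

# Inputs long enough to need hundreds of inferred word breaks used to raise
# OverflowError; as of 2.4.2 the penalty underflows and the result is 0.
lots_of_ls = 'l' * 800
print(word_frequency(lots_of_ls, 'zh'))   # 0.0
print(zipf_frequency(lots_of_ls, 'zh'))   # 0.0
```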
README.md (65 lines changed)
@@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example.
 ## Additional CJK installation
 
 Chinese, Japanese, and Korean have additional external dependencies so that
-they can be tokenized correctly. Here we'll explain how to set them up,
-in increasing order of difficulty.
+they can be tokenized correctly. They can all be installed at once by requesting
+the 'cjk' feature:
 
+    pip install wordfreq[cjk]
 
-### Chinese
-
-To be able to look up word frequencies in Chinese, you need Jieba, a
-pure-Python Chinese tokenizer:
-
-    pip3 install jieba
-
-
-### Japanese
-
-We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three
-things need to be installed:
-
-* The MeCab development library (called `libmecab-dev` on Ubuntu)
-* The UTF-8 version of the `ipadic` Japanese dictionary
-  (called `mecab-ipadic-utf8` on Ubuntu)
-* The `mecab-python3` Python interface
-
-To install these three things on Ubuntu, you can run:
-
-```sh
-sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8
-pip3 install mecab-python3
-```
-
-If you choose to install `ipadic` from somewhere else or from its source code,
-be sure it's configured to use UTF-8. By default it will use EUC-JP, which will
-give you nonsense results.
-
-
-### Korean
-
-Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and
-Yungho Yu. This dictionary is not available as an Ubuntu package.
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends
+on `mecab-python` and `ipadic`, and tokenizing Korean depends on `mecab-python`
+and `mecab-ko-dic`.
+
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 
 ## License
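With the new `cjk` extra installed, no system packages or separate dictionary downloads are needed. A minimal sanity check, using wordfreq's public `tokenize` and `word_frequency` functions (the example strings are illustrative, not taken from the PR):

```python
from wordfreq import tokenize, word_frequency

print(tokenize('谢谢谢谢', 'zh'))            # Chinese, tokenized by jieba
print(tokenize('おはようございます', 'ja'))   # Japanese, tokenized by MeCab + ipadic
print(tokenize('안녕하세요', 'ko'))           # Korean, tokenized by MeCab + mecab-ko-dic
print(word_frequency('谢谢', 'zh'))          # word frequency from the zh wordlist
```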
setup.py (14 lines changed)
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
     name="wordfreq",
-    version='2.4.1',
+    version='2.4.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        # previous names for extras
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
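Because the extras now declare the dictionaries as ordinary Python dependencies, an environment installed with the `cjk` extra can be smoke-tested by importing each of them. This is an illustrative check, not part of the PR; the module names are assumptions based on the packages above (`mecab-python3` imports as `MeCab`, `mecab-ko-dic` as `mecab_ko_dic`):

```python
import importlib

# Hypothetical smoke test for the 'cjk' extra; import names assumed from the
# distributions listed in extras_require.
for module in ('jieba', 'MeCab', 'ipadic', 'mecab_ko_dic'):
    importlib.import_module(module)
print('all CJK tokenizer dependencies are importable')
```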
@@ -1,4 +1,4 @@
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, word_frequency, zipf_frequency
 import pytest
 
 
@@ -77,3 +77,13 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq due to an overflow in
+    # exponentiation. We've now changed the sequence of operations so it
+    # will underflow instead.
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') == 0.
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
     # If we used the Jieba tokenizer, we could tokenize anything to match
     # our wordlist, even nonsense. To counteract this, we multiply by a
     # probability for each word break that was inferred.
-    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+    freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
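The one-character change above is the whole overflow fix, so here is a sketch of why the order of operations matters, assuming an integer `INFERRED_SPACE_FACTOR` like the illustrative 10 below: dividing by `10 ** 799` forces Python to convert an 800-digit integer to a float, which raises `OverflowError`, while `10 ** -799` is evaluated in floating point and quietly underflows to `0.0`.

```python
INFERRED_SPACE_FACTOR = 10   # illustrative value, not necessarily wordfreq's constant
freq = 1e-5                  # some small word frequency
n_breaks = 799               # e.g. 800 single-character tokens -> 799 inferred breaks

try:
    freq / INFERRED_SPACE_FACTOR ** n_breaks       # old order of operations
except OverflowError as err:
    print('old code path:', err)                   # int too large to convert to float

print(freq * INFERRED_SPACE_FACTOR ** -n_breaks)   # new code path: underflows to 0.0
```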
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
 
 
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
     """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
     """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
-
-
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang))
+
+
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
 
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
-    if lang not in MECAB_DICTIONARY_NAMES:
-        raise ValueError("Can't run MeCab on language %r" % lang)
     if lang not in MECAB_ANALYZERS:
-        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
     text = unicodedata.normalize('NFKC', text.strip())
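For context on what `make_mecab_analyzer` now returns: the `ipadic` and `mecab_ko_dic` packages each expose a `MECAB_ARGS` string (used in the diff above) that points MeCab at their bundled dictionary, so no filesystem search is needed anymore. A standalone sketch of the same wiring, assuming `mecab-python3` and `ipadic` are installed (both are pulled in by `wordfreq[cjk]`):

```python
import MeCab
import ipadic

# Point MeCab at the pip-installed IPADIC dictionary instead of a system path.
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

# parse() returns the same table the mecab command-line tool prints;
# as in mecab_tokenize(), the token surface form is the first column.
for row in tagger.parse('おはようございます').splitlines():
    if row and row != 'EOS':
        print(row.split('\t')[0])
```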