From 75a56b68fb19ab9f8a4e5a0bc2b2221cf2e6b463 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 14:44:39 -0500
Subject: [PATCH 1/4] change math for INFERRED_SPACE_FACTOR to not overflow

---
 tests/test_chinese.py | 8 ++++++++
 wordfreq/__init__.py  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index ce157db..83e2d70 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index bad4c92..17c910a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+        freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits

From ed23bf3ebe07a1807c413b22db4a1529bc1b3d82 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 15:09:31 -0500
Subject: [PATCH 2/4] specifically test that the long sequence underflows to 0

---
 tests/test_chinese.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 83e2d70..4bde6c2 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -1,4 +1,4 @@
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, word_frequency, zipf_frequency
 import pytest
 
 
@@ -80,8 +80,10 @@ def test_alternate_codes():
 
 
 def test_unreasonably_long():
-    # This crashed earlier versions of wordfreq
+    # This crashed earlier versions of wordfreq due to an overflow in
+    # exponentiation. We've now changed the sequence of operations so it
+    # will underflow instead.
     lots_of_ls = 'l' * 800
-    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert word_frequency(lots_of_ls, 'zh') == 0.
     assert zipf_frequency(lots_of_ls, 'zh') == 0.
 

From de636a804e078971947d5e9df932b4798bbc3eb5 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 18:18:06 -0500
Subject: [PATCH 3/4] Use Python packages to find dictionaries for MeCab

---
 setup.py          | 10 ++++----
 wordfreq/mecab.py | 61 ++++++++++------------------------------------
 2 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/setup.py b/setup.py
index e2b1451..f68404a 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
     name="wordfreq",
-    version='2.4.1',
+    version='2.5.0',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,8 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index 8607387..fee555c 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
 
 
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
     """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
     """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
 
 
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
 
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would
     output. We find the tokens in the first column of this table.
""" - if lang not in MECAB_DICTIONARY_NAMES: - raise ValueError("Can't run MeCab on language %r" % lang) if lang not in MECAB_ANALYZERS: - MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang]) + MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang) analyzer = MECAB_ANALYZERS[lang] text = unicodedata.normalize('NFKC', text.strip()) From 168bb2a6ed1977e96c0a404cbb1dc6ed192c1d5c Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 18 Feb 2021 18:25:16 -0500 Subject: [PATCH 4/4] fix version, update instructions and changelog --- CHANGELOG.md | 11 +++++++++ README.md | 65 ++++++---------------------------------------------- setup.py | 8 +++++-- 3 files changed, 24 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc23a9d..5460ffc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## Version 2.4.2 (2021-02-19) + +- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to + be installed separately as system packages. They can now be found via the + Python packages `ipadic` and `mecab-ko-dic`. + +- When the tokenizer had to infer word boundaries in languages without spaces, + inputs that were too long (such as the letter 'l' repeated 800 times) were + causing overflow errors. We changed the sequence of operations so that it + no longer overflows, and such inputs simply get a frequency of 0. + ## Version 2.4.1 (2021-02-09) - Changed a log message to not try to call a language by name, to remove diff --git a/README.md b/README.md index 00c453a..4f31cd7 100644 --- a/README.md +++ b/README.md @@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example. ## Additional CJK installation Chinese, Japanese, and Korean have additional external dependencies so that -they can be tokenized correctly. Here we'll explain how to set them up, -in increasing order of difficulty. +they can be tokenized correctly. They can all be installed at once by requesting +the 'cjk' feature: + pip install wordfreq[cjk] -### Chinese +Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends +on `mecab-python` and `ipadic`, and tokenizing Korean depends on `mecab-python` +and `mecab-ko-dic`. -To be able to look up word frequencies in Chinese, you need Jieba, a -pure-Python Chinese tokenizer: - - pip3 install jieba - - -### Japanese - -We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three -things need to be installed: - - * The MeCab development library (called `libmecab-dev` on Ubuntu) - * The UTF-8 version of the `ipadic` Japanese dictionary - (called `mecab-ipadic-utf8` on Ubuntu) - * The `mecab-python3` Python interface - -To install these three things on Ubuntu, you can run: - -```sh -sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8 -pip3 install mecab-python3 -``` - -If you choose to install `ipadic` from somewhere else or from its source code, -be sure it's configured to use UTF-8. By default it will use EUC-JP, which will -give you nonsense results. - - -### Korean - -Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and -Yungho Yu. This dictionary is not available as an Ubuntu package. 
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 ## License
 
diff --git a/setup.py b/setup.py
index f68404a..098b07b 100755
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ dependencies = [
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.4.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
+        # previous names for extras
         'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
-        'jieba': ['jieba >= 0.42']
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
     tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
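
A quick way to see the arithmetic behind PATCH 1/4 outside of wordfreq. This is
a standalone sketch: 10 stands in for INFERRED_SPACE_FACTOR (the real constant
lives in wordfreq/__init__.py), and the token count is made up for illustration.

    freq = 1e-9      # a small per-token frequency
    n_breaks = 799   # e.g. 800 one-character tokens -> 799 inferred word breaks

    # Old math: 10 ** 799 is an arbitrary-precision int, and dividing a float
    # by it forces a conversion to float, which raises
    # OverflowError ("int too large to convert to float").
    try:
        freq / 10 ** n_breaks
    except OverflowError as err:
        print("old math:", err)

    # New math: a negative exponent makes ** return a float, which quietly
    # underflows to 0.0, so the input simply gets a frequency of 0.
    print("new math:", freq * 10 ** -n_breaks)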
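For PATCH 3/4: the `ipadic` and `mecab-ko-dic` packages ship the dictionary
data on PyPI and expose a MECAB_ARGS string that can be handed straight to
MeCab.Tagger, which is what the new make_mecab_analyzer does. A minimal sketch
of that usage, assuming `mecab-python3` and `ipadic` are installed (the sample
sentence is arbitrary):

    import MeCab
    import ipadic

    # MECAB_ARGS points MeCab at the dictionary bundled inside the wheel,
    # so no system-wide dictionary package is needed.
    tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
    print(tagger.parse('単語の頻度を調べます'))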