From 75a56b68fb19ab9f8a4e5a0bc2b2221cf2e6b463 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 14:44:39 -0500
Subject: [PATCH 1/4] change math for INFERRED_SPACE_FACTOR to not overflow

---
 tests/test_chinese.py | 8 ++++++++
 wordfreq/__init__.py  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index ce157db..83e2d70 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -77,3 +77,11 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index bad4c92..17c910a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+        freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits

From ed23bf3ebe07a1807c413b22db4a1529bc1b3d82 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 15:09:31 -0500
Subject: [PATCH 2/4] specifically test that the long sequence underflows to 0

---
 tests/test_chinese.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 83e2d70..4bde6c2 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -1,4 +1,4 @@
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, word_frequency, zipf_frequency
 import pytest
 
 
@@ -80,8 +80,10 @@ def test_alternate_codes():
 
 
 def test_unreasonably_long():
-    # This crashed earlier versions of wordfreq
+    # This crashed earlier versions of wordfreq due to an overflow in
+    # exponentiation. We've now changed the sequence of operations so it
+    # will underflow instead.
     lots_of_ls = 'l' * 800
-    assert word_frequency(lots_of_ls, 'zh') < 1e-300
+    assert word_frequency(lots_of_ls, 'zh') == 0.
     assert zipf_frequency(lots_of_ls, 'zh') == 0.
 

From de636a804e078971947d5e9df932b4798bbc3eb5 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 18 Feb 2021 18:18:06 -0500
Subject: [PATCH 3/4] Use Python packages to find dictionaries for MeCab

---
 setup.py          | 10 ++++----
 wordfreq/mecab.py | 61 ++++++++++------------------------------------
 2 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/setup.py b/setup.py
index e2b1451..f68404a 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
     name="wordfreq",
-    version='2.4.1',
+    version='2.5.0',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,8 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index 8607387..fee555c 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
 
 
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
     """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
     """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang=lang))
 
 
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
 
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would
     output. We find the tokens in the first column of this table.
""" - if lang not in MECAB_DICTIONARY_NAMES: - raise ValueError("Can't run MeCab on language %r" % lang) if lang not in MECAB_ANALYZERS: - MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang]) + MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang) analyzer = MECAB_ANALYZERS[lang] text = unicodedata.normalize('NFKC', text.strip()) From 168bb2a6ed1977e96c0a404cbb1dc6ed192c1d5c Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 18 Feb 2021 18:25:16 -0500 Subject: [PATCH 4/4] fix version, update instructions and changelog --- CHANGELOG.md | 11 +++++++++ README.md | 65 ++++++---------------------------------------------- setup.py | 8 +++++-- 3 files changed, 24 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc23a9d..5460ffc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## Version 2.4.2 (2021-02-19) + +- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to + be installed separately as system packages. They can now be found via the + Python packages `ipadic` and `mecab-ko-dic`. + +- When the tokenizer had to infer word boundaries in languages without spaces, + inputs that were too long (such as the letter 'l' repeated 800 times) were + causing overflow errors. We changed the sequence of operations so that it + no longer overflows, and such inputs simply get a frequency of 0. + ## Version 2.4.1 (2021-02-09) - Changed a log message to not try to call a language by name, to remove diff --git a/README.md b/README.md index 00c453a..4f31cd7 100644 --- a/README.md +++ b/README.md @@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example. ## Additional CJK installation Chinese, Japanese, and Korean have additional external dependencies so that -they can be tokenized correctly. Here we'll explain how to set them up, -in increasing order of difficulty. +they can be tokenized correctly. They can all be installed at once by requesting +the 'cjk' feature: + pip install wordfreq[cjk] -### Chinese +Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends +on `mecab-python` and `ipadic`, and tokenizing Korean depends on `mecab-python` +and `mecab-ko-dic`. -To be able to look up word frequencies in Chinese, you need Jieba, a -pure-Python Chinese tokenizer: - - pip3 install jieba - - -### Japanese - -We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three -things need to be installed: - - * The MeCab development library (called `libmecab-dev` on Ubuntu) - * The UTF-8 version of the `ipadic` Japanese dictionary - (called `mecab-ipadic-utf8` on Ubuntu) - * The `mecab-python3` Python interface - -To install these three things on Ubuntu, you can run: - -```sh -sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8 -pip3 install mecab-python3 -``` - -If you choose to install `ipadic` from somewhere else or from its source code, -be sure it's configured to use UTF-8. By default it will use EUC-JP, which will -give you nonsense results. - - -### Korean - -Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and -Yungho Yu. This dictionary is not available as an Ubuntu package. 
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 ## License
 
diff --git a/setup.py b/setup.py
index f68404a..098b07b 100755
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ dependencies = [
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.4.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
+        # previous names for extras
         'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
-        'jieba': ['jieba >= 0.42']
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
     tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
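
A quick way to see the arithmetic behind PATCH 1/4 outside of wordfreq. This is
a standalone sketch: 10 stands in for INFERRED_SPACE_FACTOR (the real constant
lives in wordfreq/__init__.py), and the token count is made up for illustration.

    freq = 1e-9      # a small per-token frequency
    n_breaks = 799   # e.g. 800 one-character tokens -> 799 inferred word breaks

    # Old math: 10 ** 799 is an arbitrary-precision int, and dividing a float
    # by it forces a conversion to float, which raises
    # OverflowError ("int too large to convert to float").
    try:
        freq / 10 ** n_breaks
    except OverflowError as err:
        print("old math:", err)

    # New math: a negative exponent makes ** return a float, which quietly
    # underflows to 0.0, so the input simply gets a frequency of 0.
    print("new math:", freq * 10 ** -n_breaks)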
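For PATCH 3/4: the `ipadic` and `mecab-ko-dic` packages ship the dictionary
data on PyPI and expose a MECAB_ARGS string that can be handed straight to
MeCab.Tagger, which is what the new make_mecab_analyzer does. A minimal sketch
of that usage, assuming `mecab-python3` and `ipadic` are installed (the sample
sentence is arbitrary):

    import MeCab
    import ipadic

    # MECAB_ARGS points MeCab at the dictionary bundled inside the wheel,
    # so no system-wide dictionary package is needed.
    tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
    print(tagger.parse('単語の頻度を調べます'))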