Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Merge pull request #89 from LuminosoInsight/dependencies-and-tokens
Rework CJK dependencies and fix a tokenization bug
Commit 32093d9efc

CHANGELOG.md (11 lines changed)
@@ -1,3 +1,14 @@
+## Version 2.4.2 (2021-02-19)
+
+- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
+  be installed separately as system packages. They can now be found via the
+  Python packages `ipadic` and `mecab-ko-dic`.
+
+- When the tokenizer had to infer word boundaries in languages without spaces,
+  inputs that were too long (such as the letter 'l' repeated 800 times) were
+  causing overflow errors. We changed the sequence of operations so that it
+  no longer overflows, and such inputs simply get a frequency of 0.
+
 ## Version 2.4.1 (2021-02-09)
 
 - Changed a log message to not try to call a language by name, to remove
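The overflow fix described in the second bullet is exercised by the new test further down in this diff. As a quick sketch of the user-visible behavior after the change (not code from the PR itself):

```python
from wordfreq import word_frequency, zipf_frequency

# Inputs long enough to need hundreds of inferred word breaks used to raise
# OverflowError; as of 2.4.2 the penalty underflows and the result is 0.
lots_of_ls = 'l' * 800
print(word_frequency(lots_of_ls, 'zh'))   # 0.0
print(zipf_frequency(lots_of_ls, 'zh'))   # 0.0
```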
README.md (65 lines changed)
@@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example.
 ## Additional CJK installation
 
 Chinese, Japanese, and Korean have additional external dependencies so that
-they can be tokenized correctly. Here we'll explain how to set them up,
-in increasing order of difficulty.
+they can be tokenized correctly. They can all be installed at once by requesting
+the 'cjk' feature:
 
+    pip install wordfreq[cjk]
 
-### Chinese
-
-To be able to look up word frequencies in Chinese, you need Jieba, a
-pure-Python Chinese tokenizer:
-
-    pip3 install jieba
-
-
-### Japanese
-
-We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three
-things need to be installed:
-
-* The MeCab development library (called `libmecab-dev` on Ubuntu)
-* The UTF-8 version of the `ipadic` Japanese dictionary
-  (called `mecab-ipadic-utf8` on Ubuntu)
-* The `mecab-python3` Python interface
-
-To install these three things on Ubuntu, you can run:
-
-```sh
-sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8
-pip3 install mecab-python3
-```
-
-If you choose to install `ipadic` from somewhere else or from its source code,
-be sure it's configured to use UTF-8. By default it will use EUC-JP, which will
-give you nonsense results.
-
-
-### Korean
-
-Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and
-Yungho Yu. This dictionary is not available as an Ubuntu package.
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends
+on `mecab-python` and `ipadic`, and tokenizing Korean depends on `mecab-python`
+and `mecab-ko-dic`.
+
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 
 ## License
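With the new `cjk` extra installed, no system packages or separate dictionary downloads are needed. A minimal sanity check, using wordfreq's public `tokenize` and `word_frequency` functions (the example strings are illustrative, not taken from the PR):

```python
from wordfreq import tokenize, word_frequency

print(tokenize('谢谢谢谢', 'zh'))            # Chinese, tokenized by jieba
print(tokenize('おはようございます', 'ja'))   # Japanese, tokenized by MeCab + ipadic
print(tokenize('안녕하세요', 'ko'))           # Korean, tokenized by MeCab + mecab-ko-dic
print(word_frequency('谢谢', 'zh'))          # word frequency from the zh wordlist
```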
setup.py (14 lines changed)
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
 ]
 
 setup(
     name="wordfreq",
-    version='2.4.1',
+    version='2.4.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3',
-        'jieba': 'jieba >= 0.42'
+        # previous names for extras
+        'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
-    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42'],
+    tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
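Because the extras now declare the dictionaries as ordinary Python dependencies, an environment installed with the `cjk` extra can be smoke-tested by importing each of them. This is an illustrative check, not part of the PR; the module names are assumptions based on the packages above (`mecab-python3` imports as `MeCab`, `mecab-ko-dic` as `mecab_ko_dic`):

```python
import importlib

# Hypothetical smoke test for the 'cjk' extra; import names assumed from the
# distributions listed in extras_require.
for module in ('jieba', 'MeCab', 'ipadic', 'mecab_ko_dic'):
    importlib.import_module(module)
print('all CJK tokenizer dependencies are importable')
```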
@@ -1,4 +1,4 @@
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, word_frequency, zipf_frequency
 import pytest
 
 
@@ -77,3 +77,13 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_unreasonably_long():
+    # This crashed earlier versions of wordfreq due to an overflow in
+    # exponentiation. We've now changed the sequence of operations so it
+    # will underflow instead.
+    lots_of_ls = 'l' * 800
+    assert word_frequency(lots_of_ls, 'zh') == 0.
+    assert zipf_frequency(lots_of_ls, 'zh') == 0.
@@ -263,7 +263,7 @@ def _word_frequency(word, lang, wordlist, minimum):
     # If we used the Jieba tokenizer, we could tokenize anything to match
     # our wordlist, even nonsense. To counteract this, we multiply by a
     # probability for each word break that was inferred.
-    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
+    freq *= INFERRED_SPACE_FACTOR ** -(len(tokens) - 1)
 
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
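The one-character change above is the whole overflow fix, so here is a sketch of why the order of operations matters, assuming an integer `INFERRED_SPACE_FACTOR` like the illustrative 10 below: dividing by `10 ** 799` forces Python to convert an 800-digit integer to a float, which raises `OverflowError`, while `10 ** -799` is evaluated in floating point and quietly underflows to `0.0`.

```python
INFERRED_SPACE_FACTOR = 10   # illustrative value, not necessarily wordfreq's constant
freq = 1e-5                  # some small word frequency
n_breaks = 799               # e.g. 800 single-character tokens -> 799 inferred breaks

try:
    freq / INFERRED_SPACE_FACTOR ** n_breaks       # old order of operations
except OverflowError as err:
    print('old code path:', err)                   # int too large to convert to float

print(freq * INFERRED_SPACE_FACTOR ** -n_breaks)   # new code path: underflows to 0.0
```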
@@ -8,56 +8,21 @@ import os
 MAX_PATH_LENGTH = 58
 
 
-def find_mecab_dictionary(names):
+def make_mecab_analyzer(lang):
     """
-    Find a MeCab dictionary with a given name. The dictionary has to be
-    installed separately -- see wordfreq's README for instructions.
+    Get a MeCab analyzer object, given the language code of the language to
+    analyze.
     """
-    suggested_pkg = names[0]
-    paths = [
-        os.path.expanduser('~/.local/lib/mecab/dic'),
-        '/var/lib/mecab/dic',
-        '/var/local/lib/mecab/dic',
-        '/usr/lib/mecab/dic',
-        '/usr/local/lib/mecab/dic',
-        '/usr/lib/x86_64-linux-gnu/mecab/dic',
-    ]
-    full_paths = [os.path.join(path, name) for path in paths for name in names]
-    checked_paths = [path for path in full_paths if len(path) <= MAX_PATH_LENGTH]
-    for path in checked_paths:
-        if os.path.exists(path):
-            return path
-
-    error_lines = [
-        "Couldn't find the MeCab dictionary named %r." % suggested_pkg,
-        "You should download or use your system's package manager to install",
-        "the %r package." % suggested_pkg,
-        "",
-        "We looked in the following locations:"
-    ] + ["\t%s" % path for path in checked_paths]
-
-    skipped_paths = [path for path in full_paths if len(path) > MAX_PATH_LENGTH]
-    if skipped_paths:
-        error_lines += [
-            "We had to skip these paths that are too long for MeCab to find:",
-        ] + ["\t%s" % path for path in skipped_paths]
-
-    raise OSError('\n'.join(error_lines))
-
-
-def make_mecab_analyzer(names):
-    """
-    Get a MeCab analyzer object, given a list of names the dictionary might
-    have.
-    """
-    return MeCab.Tagger('-d %s' % find_mecab_dictionary(names))
-
-
-# Describe how to get the MeCab analyzers for each language.
-MECAB_DICTIONARY_NAMES = {
-    'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'],
-    'ko': ['mecab-ko-dic', 'ko-dic']
-}
+    if lang == 'ko':
+        import mecab_ko_dic
+        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
+    elif lang == 'ja':
+        import ipadic
+        return MeCab.Tagger(ipadic.MECAB_ARGS)
+    else:
+        raise ValueError("Can't run MeCab on language {lang}".format(lang))
+
+
 # The constructed analyzers will go in this dictionary.
 MECAB_ANALYZERS = {}
 
@@ -71,10 +36,8 @@ def mecab_tokenize(text, lang):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
-    if lang not in MECAB_DICTIONARY_NAMES:
-        raise ValueError("Can't run MeCab on language %r" % lang)
     if lang not in MECAB_ANALYZERS:
-        MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang])
+        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
     text = unicodedata.normalize('NFKC', text.strip())
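For context on what `make_mecab_analyzer` now returns: the `ipadic` and `mecab_ko_dic` packages each expose a `MECAB_ARGS` string (used in the diff above) that points MeCab at their bundled dictionary, so no filesystem search is needed anymore. A standalone sketch of the same wiring, assuming `mecab-python3` and `ipadic` are installed (both are pulled in by `wordfreq[cjk]`):

```python
import MeCab
import ipadic

# Point MeCab at the pip-installed IPADIC dictionary instead of a system path.
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

# parse() returns the same table the mecab command-line tool prints;
# as in mecab_tokenize(), the token surface form is the first column.
for row in tagger.parse('おはようございます').splitlines():
    if row and row != 'EOS':
        print(row.split('\t')[0])
```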