diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc23a9d..5460ffc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## Version 2.4.2 (2021-02-19)
+
+- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
+  be installed separately as system packages. They can now be found via the
+  Python packages `ipadic` and `mecab-ko-dic`.
+
+- When the tokenizer had to infer word boundaries in languages without spaces,
+  inputs that were too long (such as the letter 'l' repeated 800 times) were
+  causing overflow errors. We changed the sequence of operations so that it
+  no longer overflows, and such inputs simply get a frequency of 0.
+
 ## Version 2.4.1 (2021-02-09)
 
 - Changed a log message to not try to call a language by name, to remove
diff --git a/README.md b/README.md
index 00c453a..4f31cd7 100644
--- a/README.md
+++ b/README.md
@@ -381,67 +381,16 @@ Simplified Chinese), you will get the `zh` wordlist, for example.
 
 ## Additional CJK installation
 
 Chinese, Japanese, and Korean have additional external dependencies so that
-they can be tokenized correctly. Here we'll explain how to set them up,
-in increasing order of difficulty.
+they can be tokenized correctly. They can all be installed at once by requesting
+the 'cjk' feature:
 
+    pip install wordfreq[cjk]
 
-### Chinese
+Tokenizing Chinese depends on the `jieba` package, tokenizing Japanese depends
+on `mecab-python3` and `ipadic`, and tokenizing Korean depends on `mecab-python3`
+and `mecab-ko-dic`.
 
-To be able to look up word frequencies in Chinese, you need Jieba, a
-pure-Python Chinese tokenizer:
-
-    pip3 install jieba
-
-
-### Japanese
-
-We use MeCab, by Taku Kudo, to tokenize Japanese. To use this in wordfreq, three
-things need to be installed:
-
-  * The MeCab development library (called `libmecab-dev` on Ubuntu)
-  * The UTF-8 version of the `ipadic` Japanese dictionary
-    (called `mecab-ipadic-utf8` on Ubuntu)
-  * The `mecab-python3` Python interface
-
-To install these three things on Ubuntu, you can run:
-
-```sh
-sudo apt-get install python3-dev libmecab-dev mecab-ipadic-utf8
-pip3 install mecab-python3
-```
-
-If you choose to install `ipadic` from somewhere else or from its source code,
-be sure it's configured to use UTF-8. By default it will use EUC-JP, which will
-give you nonsense results.
-
-
-### Korean
-
-Korean also uses MeCab, with a Korean dictionary package by Yongwoon Lee and
-Yungho Yu. This dictionary is not available as an Ubuntu package.
-
-Here's a process you can use to install the Korean dictionary and the other
-MeCab dependencies:
-
-```sh
-sudo apt-get install libmecab-dev mecab-utils
-pip3 install mecab-python3
-wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
-tar xvf mecab-ko-dic-2.0.1-20150920.tar.gz
-cd mecab-ko-dic-2.0.1-20150920
-./autogen.sh
-./configure
-make
-sudo make install
-```
-
-If wordfreq cannot find the Japanese or Korean data for MeCab when asked to
-tokenize those languages, it will raise an error and show you the list of
-paths it searched.
-
-Sorry that this is difficult. We tried to just package the data files we need
-with wordfreq, like we do for Chinese, but PyPI would reject the package for
-being too large.
+As of version 2.4.2, you no longer have to install dictionaries separately.
 
 ## License
diff --git a/setup.py b/setup.py
index f68404a..098b07b 100755
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ dependencies = [
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.4.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -55,8 +55,12 @@ setup(
     #
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
+        # previous names for extras
         'mecab': ['mecab-python3', 'ipadic', 'mecab-ko-dic'],
-        'jieba': ['jieba >= 0.42']
+        'jieba': ['jieba >= 0.42'],
+
+        # get them all at once
+        'cjk': ['mecab-python3', 'ipadic', 'mecab-ko-dic', 'jieba >= 0.42']
     },
     tests_require=['pytest', 'mecab-python3', 'jieba >= 0.42', 'ipadic', 'mecab-ko-dic'],
 )
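As a quick illustration of what this patch changes (a sketch, not part of the patch itself): it assumes wordfreq 2.4.2 is installed with the new extra via `pip install wordfreq[cjk]`, and uses wordfreq's public `tokenize` and `word_frequency` functions. The sample strings are arbitrary.

```python
# Sketch of the 2.4.2 behavior, assuming `pip install wordfreq[cjk]`.
from wordfreq import tokenize, word_frequency

# Japanese and Korean now tokenize without any system packages installed:
# MeCab loads its dictionaries from the `ipadic` and `mecab-ko-dic`
# Python packages pulled in by the 'cjk' extra.
print(tokenize("おはようございます", "ja"))
print(word_frequency("감사합니다", "ko"))

# Chinese tokenization still uses the pure-Python `jieba` package.
print(word_frequency("谢谢", "zh"))

# An over-long input in a language without spaces no longer raises an
# overflow error; per the changelog, it simply gets a frequency of 0.
print(word_frequency("l" * 800, "ja"))  # 0.0
```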