From e1d6e7d96fd0bc5a8a4d36f5e080394699ded660 Mon Sep 17 00:00:00 2001 From: Robyn Speer <rspeer@luminoso.com> Date: Fri, 19 Aug 2016 11:41:35 -0400 Subject: [PATCH 1/2] Allow MeCab to work in Japanese or Korean without the other --- wordfreq/mecab.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py index c0c8cfc..aca75d2 100644 --- a/wordfreq/mecab.py +++ b/wordfreq/mecab.py @@ -52,11 +52,13 @@ def make_mecab_analyzer(names): return MeCab.Tagger('-d %s' % find_mecab_dictionary(names)) -# Instantiate the MeCab analyzers for each language. -MECAB_ANALYZERS = { - 'ja': make_mecab_analyzer(['mecab-ipadic-utf8', 'ipadic-utf8']), - 'ko': make_mecab_analyzer(['mecab-ko-dic', 'ko-dic']) +# Describe how to get the MeCab analyzers for each language. +MECAB_DICTIONARY_NAMES = { + 'ja': ['mecab-ipadic-utf8', 'ipadic-utf8'], + 'ko': ['mecab-ko-dic', 'ko-dic'] } +# The constructed analyzers will go in this dictionary. +MECAB_ANALYZERS = {} def mecab_tokenize(text, lang): @@ -68,8 +70,11 @@ def mecab_tokenize(text, lang): contains the same table that the command-line version of MeCab would output. We find the tokens in the first column of this table. """ - if lang not in MECAB_ANALYZERS: + if lang not in MECAB_DICTIONARY_NAMES: raise ValueError("Can't run MeCab on language %r" % lang) + if lang not in MECAB_ANALYZERS: + MECAB_ANALYZERS[lang] = make_mecab_analyzer(MECAB_DICTIONARY_NAMES[lang]) + analyzer = MECAB_ANALYZERS[lang] text = unicodedata.normalize('NFKC', text.strip()) analyzed = analyzer.parse(text) From aa880bcd84272009061bfcf94e977f7e7a54885e Mon Sep 17 00:00:00 2001 From: Robyn Speer <rspeer@luminoso.com> Date: Fri, 19 Aug 2016 11:42:29 -0400 Subject: [PATCH 2/2] bump version to 1.5.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8287286..593c435 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ if sys.version_info < (3, 4): setup( name="wordfreq", - version='1.5', + version='1.5.1', maintainer='Luminoso Technologies, Inc.', maintainer_email='info@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/',