From 7bdfb7472090c54ef1b2754128e956be2de5a8fb Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Mon, 24 Aug 2015 18:13:03 -0400 Subject: [PATCH] also NFKC-normalize Japanese input Former-commit-id: 554455699d4afcd8c1c2c7b8a33ad9fe7985907e --- wordfreq/mecab.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py index 379255b..9ee3b82 100644 --- a/wordfreq/mecab.py +++ b/wordfreq/mecab.py @@ -1,4 +1,5 @@ import MeCab +import unicodedata # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a @@ -14,6 +15,7 @@ def mecab_tokenize(text): contains the same table that the command-line version of MeCab would output. We find the tokens in the first column of this table. """ + text = unicodedata.normalize('NFKC', text.strip()) return [line.split('\t')[0] - for line in MECAB_ANALYZER.parse(text.strip()).split('\n') + for line in MECAB_ANALYZER.parse(text).split('\n') if line != '' and line != 'EOS']