From 554455699d4afcd8c1c2c7b8a33ad9fe7985907e Mon Sep 17 00:00:00 2001
From: Rob Speer <rob@luminoso.com>
Date: Mon, 24 Aug 2015 18:13:03 -0400
Subject: [PATCH] also NFKC-normalize Japanese input

---
 wordfreq/mecab.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index 379255b..9ee3b82 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -1,4 +1,5 @@
 import MeCab
+import unicodedata
 
 
 # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@@ -14,6 +15,7 @@ def mecab_tokenize(text):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
+    text = unicodedata.normalize('NFKC', text.strip())
     return [line.split('\t')[0]
-            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            for line in MECAB_ANALYZER.parse(text).split('\n')
             if line != '' and line != 'EOS']