also NFKC-normalize Japanese input

This commit is contained in:
Rob Speer 2015-08-24 18:13:03 -04:00
parent 1d055edc1c
commit 554455699d

View File

@ -1,4 +1,5 @@
import MeCab
import unicodedata
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@ -14,6 +15,7 @@ def mecab_tokenize(text):
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
for line in MECAB_ANALYZER.parse(text).split('\n')
if line != '' and line != 'EOS']