also NFKC-normalize Japanese input

This commit is contained in:
Rob Speer 2015-08-24 18:13:03 -04:00
parent 1d055edc1c
commit 554455699d

View File

@@ -1,4 +1,5 @@
import MeCab import MeCab
import unicodedata
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@@ -14,6 +15,7 @@ def mecab_tokenize(text):
contains the same table that the command-line version of MeCab would output. contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table. We find the tokens in the first column of this table.
""" """
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0] return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text.strip()).split('\n') for line in MECAB_ANALYZER.parse(text).split('\n')
if line != '' and line != 'EOS'] if line != '' and line != 'EOS']