also NFKC-normalize Japanese input

Former-commit-id: 554455699d
This commit is contained in:
Robyn Speer 2015-08-24 18:13:03 -04:00
parent 13096b26bd
commit 7bdfb74720

View File

@@ -1,4 +1,5 @@
 import MeCab
+import unicodedata
 
 
 # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@@ -14,6 +15,7 @@ def mecab_tokenize(text):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
+    text = unicodedata.normalize('NFKC', text.strip())
     return [line.split('\t')[0]
-            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            for line in MECAB_ANALYZER.parse(text).split('\n')
             if line != '' and line != 'EOS']