mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
also NFKC-normalize Japanese input
This commit is contained in:
parent
1d055edc1c
commit
554455699d
@ -1,4 +1,5 @@
|
|||||||
import MeCab
|
import MeCab
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
|
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
|
||||||
@ -14,6 +15,7 @@ def mecab_tokenize(text):
|
|||||||
contains the same table that the command-line version of MeCab would output.
|
contains the same table that the command-line version of MeCab would output.
|
||||||
We find the tokens in the first column of this table.
|
We find the tokens in the first column of this table.
|
||||||
"""
|
"""
|
||||||
|
text = unicodedata.normalize('NFKC', text.strip())
|
||||||
return [line.split('\t')[0]
|
return [line.split('\t')[0]
|
||||||
for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
|
for line in MECAB_ANALYZER.parse(text).split('\n')
|
||||||
if line != '' and line != 'EOS']
|
if line != '' and line != 'EOS']
|
||||||
|
Loading…
Reference in New Issue
Block a user