Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
also NFKC-normalize Japanese input
This commit is contained in:
parent 1d055edc1c
commit 554455699d
@@ -1,4 +1,5 @@
 import MeCab
+import unicodedata
 
 
 # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@@ -14,6 +15,7 @@ def mecab_tokenize(text):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
+    text = unicodedata.normalize('NFKC', text.strip())
     return [line.split('\t')[0]
-            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            for line in MECAB_ANALYZER.parse(text).split('\n')
             if line != '' and line != 'EOS']
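For context, a brief illustrative sketch (not part of the commit) of what NFKC normalization does to typical Japanese input before it reaches MeCab: it folds half-width katakana and full-width Latin letters and digits into their canonical forms, so the tokenizer sees uniform text. The sample strings below are assumptions chosen for illustration only.

import unicodedata

# Half-width katakana and full-width Latin/digits are common in raw Japanese
# text; NFKC maps them to their canonical forms.
samples = [
    'ﾃｽﾄ',        # half-width katakana -> 'テスト'
    'Ｐｙｔｈｏｎ３',   # full-width Latin and digit -> 'Python3'
]
for s in samples:
    print(repr(s), '->', repr(unicodedata.normalize('NFKC', s.strip())))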