mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
parent
93681e43b3
commit
c5135edd88
@ -73,7 +73,6 @@ def simple_tokenize(text):
|
|||||||
"""
|
"""
|
||||||
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||||
|
|
||||||
mecab_tokenize = None
|
|
||||||
def tokenize(text, lang):
|
def tokenize(text, lang):
|
||||||
"""
|
"""
|
||||||
Tokenize this text in a way that's straightforward but appropriate for
|
Tokenize this text in a way that's straightforward but appropriate for
|
||||||
@ -87,10 +86,7 @@ def tokenize(text, lang):
|
|||||||
first, so that they can be expected to match the data.
|
first, so that they can be expected to match the data.
|
||||||
"""
|
"""
|
||||||
if lang == 'ja':
|
if lang == 'ja':
|
||||||
global mecab_tokenize
|
from wordfreq.mecab import mecab_tokenize
|
||||||
if mecab_tokenize is None:
|
|
||||||
from wordfreq.mecab import mecab_tokenize
|
|
||||||
return mecab_tokenize(text)
|
|
||||||
|
|
||||||
if lang == 'ar':
|
if lang == 'ar':
|
||||||
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||||
|
Loading…
Reference in New Issue
Block a user