mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
parent
78bff813e3
commit
60782d3796
@@ -118,6 +118,10 @@ def tokenize(text, lang):
|
|||||||
except NameError:
|
except NameError:
|
||||||
from wordfreq.mecab import mecab_tokenize
|
from wordfreq.mecab import mecab_tokenize
|
||||||
return mecab_tokenize(text)
|
return mecab_tokenize(text)
|
||||||
|
elif lang == 'ar':
|
||||||
|
tokens = simple_tokenize(text)
|
||||||
|
tokens = [token.replace('ـ', '') for token in tokens]
|
||||||
|
return [token for token in tokens if token] # remove empty strings
|
||||||
else:
|
else:
|
||||||
return simple_tokenize(text)
|
return simple_tokenize(text)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user