Removes Arabic tatweels (the 'ـ' kashida elongation character, U+0640) from tokens

Former-commit-id: 83797bd276
This commit is contained in:
Joshua Chin 2015-06-25 12:02:59 -04:00
parent 78bff813e3
commit 60782d3796

View File

@ -118,6 +118,10 @@ def tokenize(text, lang):
except NameError:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
elif lang == 'ar':
tokens = simple_tokenize(text)
tokens = [token.replace('ـ', '') for token in tokens]
return [token for token in tokens if token] # remove empty strings
else:
return simple_tokenize(text)