Removes Arabic "commas" (actually the tatweel character 'ـ', U+0640) from tokens

This commit is contained in:
Joshua Chin 2015-06-25 12:02:59 -04:00
parent 6e1f7e30c6
commit 83797bd276

View File

@@ -118,6 +118,10 @@ def tokenize(text, lang):
except NameError: except NameError:
from wordfreq.mecab import mecab_tokenize from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text) return mecab_tokenize(text)
elif lang == 'ar':
tokens = simple_tokenize(text)
tokens = [token.replace('ـ', '') for token in tokens]
return [token for token in tokens if token] # remove empty strings
else: else:
return simple_tokenize(text) return simple_tokenize(text)