factored out fixing Arabic

This commit is contained in:
Joshua Chin 2015-07-17 15:39:12 -04:00
parent 303bd88ba2
commit 4e3a5263c3

View File

@@ -66,11 +66,18 @@ def tokenize(text, lang):
return mecab_tokenize(text)
if lang == 'ar':
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
text = standardize_arabic(text)
return simple_tokenize(text)
def standardize_arabic(text):
    """
    Return a standardized copy of Arabic `text`.

    Two clean-up steps are applied in order: every tatweel character is
    dropped, and then every match of the module-level COMBINING_MARK_RE
    pattern (combining marks) is removed from what remains.
    """
    # Drop tatweels first so the regex only has to scan the shorter string.
    without_tatweels = text.replace('ـ', '')
    return COMBINING_MARK_RE.sub('', without_tatweels)
def read_cBpack(filename):
"""
Read a file from an idiosyncratic format that we use for storing