From 119618226df298dbd2aa7fa9c4c9bfb22483aa0c Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Fri, 17 Jul 2015 15:39:12 -0400 Subject: [PATCH] factored out fixing arabic Former-commit-id: 4e3a5263c34e8660742e8382085ca3a7582dc4d2 --- wordfreq/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 07daf9f..9867e89 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -66,11 +66,18 @@ def tokenize(text, lang): return mecab_tokenize(text) if lang == 'ar': - text = COMBINING_MARK_RE.sub('', text.replace('ـ', '')) + text = standardize_arabic(text) return simple_tokenize(text) +def standardize_arabic(text): + """ + Standardizes Arabic text by removing combining marks and tatweels. + """ + return COMBINING_MARK_RE.sub('', text.replace('ـ', '')) + + def read_cBpack(filename): """ Read a file from an idiosyncratic format that we use for storing