mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
factored out fixing arabic
This commit is contained in:
parent
303bd88ba2
commit
4e3a5263c3
@ -66,11 +66,18 @@ def tokenize(text, lang):
|
||||
return mecab_tokenize(text)
|
||||
|
||||
if lang == 'ar':
|
||||
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||
text = standardize_arabic(text)
|
||||
|
||||
return simple_tokenize(text)
|
||||
|
||||
|
||||
def standardize_arabic(text):
    """
    Return `text` with tatweels and combining marks stripped out.

    Every tatweel character ('ـ') is deleted first; then each match of
    COMBINING_MARK_RE in the remaining text is replaced with the empty
    string.
    """
    without_tatweels = text.replace('ـ', '')
    return COMBINING_MARK_RE.sub('', without_tatweels)
|
||||
|
||||
|
||||
def read_cBpack(filename):
|
||||
"""
|
||||
Read a file from an idiosyncratic format that we use for storing
|
||||
|
Loading…
Reference in New Issue
Block a user