Only NFKC-normalize Arabic text; use NFC for all other languages

This commit is contained in:
Rob Speer 2015-08-24 17:55:17 -04:00
parent 140ca6c050
commit 1d055edc1c

View File

@@ -38,7 +38,7 @@ def simple_tokenize(text):
by the regex package, except that it leaves Chinese and Japanese by the regex package, except that it leaves Chinese and Japanese
relatively untokenized. relatively untokenized.
""" """
text = unicodedata.normalize('NFKC', text) text = unicodedata.normalize('NFC', text)
return [token.casefold() for token in TOKEN_RE.findall(text)] return [token.casefold() for token in TOKEN_RE.findall(text)]
@@ -70,7 +70,8 @@ def tokenize(text, lang):
- All other languages will be tokenized according to UTR #29. - All other languages will be tokenized according to UTR #29.
Additionally, the text will be case-folded to lowercase, and text marked Additionally, the text will be case-folded to lowercase, and text marked
as Arabic will have combining marks and tatweels removed. as Arabic will be normalized more strongly and have combining marks and
tatweels removed.
Strings that are looked up in wordfreq will be run through this function Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data. first, so that they can be expected to match the data.
@@ -82,7 +83,7 @@ def tokenize(text, lang):
return mecab_tokenize(text) return mecab_tokenize(text)
if lang == 'ar': if lang == 'ar':
text = remove_arabic_marks(text) text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text) return simple_tokenize(text)