Only NFKC-normalize Arabic text; use NFC in simple_tokenize

Former-commit-id: 1d055edc1c
2024-12-23 17:31:41 +00:00 · 2015-08-24 17:55:17 -04:00 · 2015-08-24 17:55:17 -04:00 · 13096b26bd
commit 13096b26bd
parent 4ec128adae
1 changed file with 4 additions and 3 deletions
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -38,7 +38,7 @@ def simple_tokenize(text):
    by the regex package, except that it leaves Chinese and Japanese
    relatively untokenized.
    """
-    text = unicodedata.normalize('NFKC', text)
+    text = unicodedata.normalize('NFC', text)
    return [token.casefold() for token in TOKEN_RE.findall(text)]
@@ -70,7 +70,8 @@ def tokenize(text, lang):
    - All other languages will be tokenized according to UTR #29.
    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will have combining marks and tatweels removed.
+    as Arabic will be normalized more strongly and have combining marks and
+    tatweels removed.
    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
@@ -82,7 +83,7 @@ def tokenize(text, lang):
        return mecab_tokenize(text)
    if lang == 'ar':
-        text = remove_arabic_marks(text)
+        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
    return simple_tokenize(text)