only NFKC-normalize in Arabic

2024-12-24 01:41:39 +00:00 · 2015-08-24 17:55:17 -04:00 · 2015-08-24 17:55:17 -04:00 · 1d055edc1c
commit 1d055edc1c
parent 140ca6c050
1 changed files with 4 additions and 3 deletions
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -38,7 +38,7 @@ def simple_tokenize(text):
    by the regex package, except that it leaves Chinese and Japanese
    relatively untokenized.
    """
-    text = unicodedata.normalize('NFKC', text)
+    text = unicodedata.normalize('NFC', text)
    return [token.casefold() for token in TOKEN_RE.findall(text)]
@@ -70,7 +70,8 @@ def tokenize(text, lang):
    - All other languages will be tokenized according to UTR #29.
    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will have combining marks and tatweels removed.
+    as Arabic will be normalized more strongly and have combining marks and
+    tatweels removed.
    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
@@ -82,7 +83,7 @@ def tokenize(text, lang):
        return mecab_tokenize(text)
    if lang == 'ar':
-        text = remove_arabic_marks(text)
+        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
    return simple_tokenize(text)