only NFKC-normalize in Arabic

2024-12-23 17:31:41 +00:00 · 2015-08-24 17:55:17 -04:00 · 2015-08-24 17:55:17 -04:00 · 1d055edc1c
commit 1d055edc1c
parent 140ca6c050
1 changed files with 4 additions and 3 deletions
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -38,7 +38,7 @@ def simple_tokenize(text):
    by the regex package, except that it leaves Chinese and Japanese
    relatively untokenized.
    """
-    text = unicodedata.normalize('NFKC', text)
+    text = unicodedata.normalize('NFC', text)
    return [token.casefold() for token in TOKEN_RE.findall(text)]


@@ -70,7 +70,8 @@ def tokenize(text, lang):
    - All other languages will be tokenized according to UTR #29.

    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will have combining marks and tatweels removed.
+    as Arabic will be normalized more strongly and have combining marks and
+    tatweels removed.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
@@ -82,7 +83,7 @@ def tokenize(text, lang):
        return mecab_tokenize(text)

    if lang == 'ar':
-        text = remove_arabic_marks(text)
+        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))

    return simple_tokenize(text)