diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 5130d0f..f82a39c 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -38,7 +38,7 @@ def simple_tokenize(text):
     by the regex package, except that it leaves Chinese and Japanese
     relatively untokenized.
     """
-    text = unicodedata.normalize('NFKC', text)
+    text = unicodedata.normalize('NFC', text)
     return [token.casefold() for token in TOKEN_RE.findall(text)]


@@ -70,7 +70,8 @@ def tokenize(text, lang):
     - All other languages will be tokenized according to UTR #29.

     Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will have combining marks and tatweels removed.
+    as Arabic will be normalized more strongly and have combining marks and
+    tatweels removed.

     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
@@ -82,7 +83,7 @@ def tokenize(text, lang):
         return mecab_tokenize(text)

     if lang == 'ar':
-        text = remove_arabic_marks(text)
+        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))

     return simple_tokenize(text)
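
Note for reviewers (not part of the patch): the change hinges on the difference between NFC and NFKC. NFC only composes canonically equivalent sequences, while NFKC also folds compatibility characters, which is too aggressive for general text but is exactly what Arabic presentation forms need before mark and tatweel removal. A rough interactive sketch of that difference, using illustrative strings rather than anything from wordfreq's test suite:

    >>> import unicodedata
    >>> unicodedata.normalize('NFC', 'ﬁn')       # the U+FB01 'fi' ligature survives NFC
    'ﬁn'
    >>> unicodedata.normalize('NFKC', 'ﬁn')      # NFKC folds it to plain ASCII 'fi'
    'fin'
    >>> unicodedata.normalize('NFC', 'e\u0301')  # canonical composition happens under both forms
    'é'
    >>> unicodedata.normalize('NFKC', '\ufefb')  # Arabic lam-alef presentation form decomposes
    'لا'

So simple_tokenize() no longer rewrites compatibility characters for every language, and the 'ar' branch explicitly opts back into NFKC before calling remove_arabic_marks().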