From 960dc437a2c303dbca5774204df2e7c3cf18dd95 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 24 Sep 2015 17:47:16 -0400
Subject: [PATCH] update and clean up the tokenize() docstring

Former-commit-id: 24b16d8a5dbced484a4f66eb2e853829468b346c
---
 wordfreq/tokens.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index ad64bcd..65a9735 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -127,19 +127,25 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     - Chinese will be mapped to Simplified Chinese characters and tokenized
       using the jieba tokenizer, on a custom word list of words that can be
       looked up in wordfreq.
-    - Japanese will be delegated to the external mecab-python module.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
+
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
     - Turkish will use a different case-folding procedure, so that capital I
       and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.
 
-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.
 
     If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
     not be used for tokenization. Instead, it will use the large wordlist
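
For context, below is a minimal usage sketch of the tokenize() function whose
docstring this patch edits. It is not part of the patch itself; it assumes
wordfreq is installed (with the optional jieba dependency for the Chinese
calls), and the tokens shown in comments are illustrative of the documented
behavior rather than guaranteed output.

    # Minimal sketch of calling tokenize() as described in the updated docstring.
    from wordfreq.tokens import tokenize

    # Cased alphabets are case-folded to lowercase.
    print(tokenize("Hello, World!", "en"))    # expected roughly ['hello', 'world']

    # Turkish uses its own case-folding, so capital I and İ map to ı and i.
    print(tokenize("İstanbul", "tr"))         # expected roughly ['istanbul']
    print(tokenize("ISTANBUL", "tr"))         # expected roughly ['ıstanbul']

    # Chinese is mapped to Simplified characters and tokenized with jieba,
    # using wordfreq's custom wordlist by default; external_wordlist=True
    # switches to the external wordlist mentioned at the end of the docstring.
    print(tokenize("谢谢你", "zh"))
    print(tokenize("谢谢你", "zh", external_wordlist=True))

    # include_punctuation=True is expected to keep punctuation tokens
    # instead of dropping them.
    print(tokenize("Hello, World!", "en", include_punctuation=True))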