From 960dc437a2c303dbca5774204df2e7c3cf18dd95 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 24 Sep 2015 17:47:16 -0400
Subject: [PATCH] update and clean up the tokenize() docstring

Former-commit-id: 24b16d8a5dbced484a4f66eb2e853829468b346c
---
 wordfreq/tokens.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index ad64bcd..65a9735 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -127,19 +127,25 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     - Chinese will be mapped to Simplified Chinese characters and tokenized
       using the jieba tokenizer, on a custom word list of words that can be
       looked up in wordfreq.
-    - Japanese will be delegated to the external mecab-python module.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
+
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
     - Turkish will use a different case-folding procedure, so that capital I
       and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.
 
-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.
 
     If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
     not be used for tokenization. Instead, it will use the large wordlist
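
For context, below is a minimal usage sketch of the tokenize() function whose
docstring this patch edits. It is not part of the patch itself; it assumes
wordfreq is installed (with the optional jieba dependency for the Chinese
calls), and the tokens shown in comments are illustrative of the documented
behavior rather than guaranteed output.

    # Minimal sketch of calling tokenize() as described in the updated docstring.
    from wordfreq.tokens import tokenize

    # Cased alphabets are case-folded to lowercase.
    print(tokenize("Hello, World!", "en"))    # expected roughly ['hello', 'world']

    # Turkish uses its own case-folding, so capital I and İ map to ı and i.
    print(tokenize("İstanbul", "tr"))         # expected roughly ['istanbul']
    print(tokenize("ISTANBUL", "tr"))         # expected roughly ['ıstanbul']

    # Chinese is mapped to Simplified characters and tokenized with jieba,
    # using wordfreq's custom wordlist by default; external_wordlist=True
    # switches to the external wordlist mentioned at the end of the docstring.
    print(tokenize("谢谢你", "zh"))
    print(tokenize("谢谢你", "zh", external_wordlist=True))

    # include_punctuation=True is expected to keep punctuation tokens
    # instead of dropping them.
    print(tokenize("Hello, World!", "en", include_punctuation=True))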