From 3bd1fe2fe62c7a2bce5a91fd891e633aa93fbaba Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Mon, 28 Sep 2015 12:58:20 -0400
Subject: [PATCH] Fix documentation and clean up, based on Sep 25 code review

Former-commit-id: 44b0c4f9bab48569d9f47219a8ae99f494e3d95d
---
 README.md           | 16 +++++++++-------
 wordfreq/chinese.py | 19 +++++++++++++++++++
 wordfreq/tokens.py  |  1 -
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 3e6cfbc..5146fa3 100644
--- a/README.md
+++ b/README.md
@@ -192,14 +192,16 @@ into multiple tokens:
     3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
-written without spaces, there is also a penalty to the word frequency for each
-word break that must be inferred.
+to provide an estimate of what their combined frequency would be. In Chinese,
+where the word breaks must be inferred from the frequency of the resulting
+words, there is also a penalty to the word frequency for each word break that
+must be inferred.
 
-This implicitly assumes that you're asking about words that frequently appear
-together. It's not multiplying the frequencies, because that would assume they
-are statistically unrelated. So if you give it an uncommon combination of
-tokens, it will hugely over-estimate their frequency:
+This method of combining word frequencies implicitly assumes that you're asking
+about words that frequently appear together. It's not multiplying the
+frequencies, because that would assume they are statistically unrelated. So if
+you give it an uncommon combination of tokens, it will hugely over-estimate
+their frequency:
 
     >>> word_frequency('owl-flavored', 'en')
     1.3557098723512335e-06
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index c07e77e..c923f83 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -10,10 +10,29 @@ jieba_tokenizer = None
 
 
 def simplify_chinese(text):
+    """
+    Convert Chinese text character-by-character to Simplified Chinese, for the
+    purpose of looking up word frequencies.
+
+    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
+    will sometimes produce nonsense words by simplifying characters that would
+    not be simplified in context, or by simplifying words that would only be
+    used in a Traditional Chinese locale. But the resulting text is still a
+    reasonable key for looking up word frequencies.
+    """
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
 def jieba_tokenize(text):
+    """
+    Tokenize the given text into tokens whose word frequencies can probably
+    be looked up. This uses Jieba, a word-frequency-based tokenizer.
+
+    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
+    to infer unknown words using a hidden Markov model. This ensures that the
+    multi-character tokens that it outputs will be ones whose word frequencies
+    we can look up.
+    """
     global jieba_tokenizer
     if jieba_tokenizer is None:
         jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index b9c156c..c67c302 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
-from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""
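
A note on the README change above: for two tokens, a half harmonic mean is half the
harmonic mean of their frequencies, which works out to (a * b) / (a + b), or
equivalently 1 / (1/a + 1/b). The sketch below only illustrates that arithmetic; the
`half_harmonic_mean` name, the reduce-over-tokens step, and the example frequencies
are assumptions for illustration, not wordfreq's actual implementation.

    from functools import reduce

    def half_harmonic_mean(a, b):
        # Half the harmonic mean of two frequencies: (a*b)/(a+b) = 1/(1/a + 1/b).
        # The result is always smaller than either input, so each extra token
        # (and especially a rare one) lowers the combined estimate.
        return (a * b) / (a + b)

    # Arbitrary made-up frequencies, just to show the shape of the combination:
    print(reduce(half_harmonic_mean, [1e-4, 1e-6]))   # ~9.9e-07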
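
The new `simplify_chinese` docstring describes a purely character-level mapping
followed by case folding. Here is a self-contained sketch of that approach, with a
deliberately tiny stand-in for `SIMPLIFIED_MAP`; wordfreq's real table covers the
full set of Traditional characters.

    # Tiny stand-in for wordfreq's SIMPLIFIED_MAP, for illustration only.
    SIMPLIFIED_MAP = str.maketrans({'愛': '爱', '車': '车', '詞': '词'})

    def simplify_chinese(text):
        # Character-by-character mapping plus case folding, the same shape of
        # operation the docstring describes.
        return text.translate(SIMPLIFIED_MAP).casefold()

    print(simplify_chinese('愛車'))   # -> '爱车'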
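
And a sketch of the Jieba behaviour the `jieba_tokenize` docstring describes, using
only Jieba's documented `Tokenizer` and `lcut` API. wordfreq points the tokenizer at
its own wordlist via `jieba.Tokenizer(dictionary=DICT_FILENAME)`; the snippet below
falls back to Jieba's default dictionary so it runs on its own.

    import jieba

    # wordfreq passes dictionary=DICT_FILENAME here; with no argument, Jieba
    # uses its default built-in dictionary.
    tokenizer = jieba.Tokenizer()

    # HMM=False keeps Jieba from guessing unknown multi-character words, so
    # every multi-character token it returns comes from the dictionary.
    print(tokenizer.lcut('谴责暴力行为', HMM=False))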