Merge pull request #29 from LuminosoInsight/code-review-notes-20150925

Fix documentation and clean up, based on Sep 25 code review
2024-12-24 09:51:38 +00:00 · 2015-09-28 13:53:50 -04:00 · 2015-09-28 13:53:50 -04:00 · 15d99be21b
commit 15d99be21b
parent cd0797e1c8 44b0c4f9ba
3 changed files with 28 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -192,14 +192,16 @@ into multiple tokens:
    3.2187603965715087e-06
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
+to provide an estimate of what their combined frequency would be. In Chinese,
-written without spaces, there is also a penalty to the word frequency for each
+where the word breaks must be inferred from the frequency of the resulting
-word break that must be inferred.
+words, there is also a penalty to the word frequency for each word break that
 must be inferred.
-This implicitly assumes that you're asking about words that frequently appear
+This method of combining word frequencies implicitly assumes that you're asking
-together. It's not multiplying the frequencies, because that would assume they
+about words that frequently appear together. It's not multiplying the
-are statistically unrelated. So if you give it an uncommon combination of
+frequencies, because that would assume they are statistically unrelated. So if
-tokens, it will hugely over-estimate their frequency:
+you give it an uncommon combination of tokens, it will hugely over-estimate
 their frequency:
    >>> word_frequency('owl-flavored', 'en')
    1.3557098723512335e-06
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -10,10 +10,29 @@ jieba_tokenizer = None
 def simplify_chinese(text):
    """
    Map Chinese text onto Simplified Chinese one character at a time, so that
    the result can serve as a key when looking up word frequencies.
    This is not a real Chinese-to-Chinese "translation": each character is
    replaced without regard to context, so the output can contain nonsense
    words, either because a character was simplified that would not be
    simplified in that context, or because the word would only be used in a
    Traditional Chinese locale. The output is nonetheless a reasonable key
    for looking up word frequencies.
    The mapped text is also case-folded before it is returned.
    """
    # Apply the character mapping first, then case-fold the mapped text.
    simplified = text.translate(SIMPLIFIED_MAP)
    return simplified.casefold()
 def jieba_tokenize(text):
    """
    Tokenize the given text into tokens whose word frequencies can probably
    be looked up. This uses Jieba, a word-frequency-based tokenizer.
    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
    to infer unknown words using a hidden Markov model. This ensures that the
    multi-character tokens that it outputs will be ones whose word frequencies
    we can look up.
    """
    global jieba_tokenizer
    if jieba_tokenizer is None:
        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
 from pkg_resources import resource_filename
 TOKEN_RE = regex.compile(r"""