Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
Merge branch 'master' into chinese-external-wordlist
Conflicts:
wordfreq/chinese.py
Former-commit-id: 1793c1bb2e
This commit is contained in: commit c9693c9502
README.md
@@ -205,14 +205,16 @@ into multiple tokens:
     3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
-written without spaces, there is also a penalty to the word frequency for each
-word break that must be inferred.
+to provide an estimate of what their combined frequency would be. In Chinese,
+where the word breaks must be inferred from the frequency of the resulting
+words, there is also a penalty to the word frequency for each word break that
+must be inferred.
 
-This implicitly assumes that you're asking about words that frequently appear
-together. It's not multiplying the frequencies, because that would assume they
-are statistically unrelated. So if you give it an uncommon combination of
-tokens, it will hugely over-estimate their frequency:
+This method of combining word frequencies implicitly assumes that you're asking
+about words that frequently appear together. It's not multiplying the
+frequencies, because that would assume they are statistically unrelated. So if
+you give it an uncommon combination of tokens, it will hugely over-estimate
+their frequency:
 
     >>> word_frequency('owl-flavored', 'en')
     1.3557098723512335e-06
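For readers of the hunk above, here is a minimal sketch of what "combined with the half-harmonic-mean function" means. It is not wordfreq's actual implementation; `combined_frequency`, `inferred_breaks`, and the `penalty` constant are illustrative assumptions, and only the half-harmonic-mean formula 1/(1/a + 1/b) comes from the text.

    # Sketch only -- not wordfreq's code. The half-harmonic-mean of two
    # frequencies a and b is (a * b) / (a + b), i.e. 1 / (1/a + 1/b), which
    # is always smaller than either frequency alone.
    from functools import reduce

    def half_harmonic_mean(a, b):
        return (a * b) / (a + b)

    def combined_frequency(token_freqs, inferred_breaks=0, penalty=10.0):
        # `penalty` is an assumed illustrative constant: each word break
        # that had to be inferred reduces the estimate further.
        freq = reduce(half_harmonic_mean, token_freqs)
        return freq / (penalty ** inferred_breaks)

    # Two tokens that often occur together get a plausible joint estimate;
    # an unlikely pair (like 'owl-flavored') is over-estimated, as the
    # README warns, because no independence assumption is applied.
    print(combined_frequency([2.0e-4, 1.5e-5]))   # ~1.4e-05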
wordfreq/chinese.py
@@ -12,21 +12,34 @@ jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
     """
     Convert Chinese text character-by-character to Simplified Chinese, for the
     purpose of looking up word frequencies.
 
     This is far too simple to be a proper Chinese-to-Chinese "translation"; it
     will sometimes produce nonsense words by simplifying characters that would
     not be simplified in context, or by simplifying words that would only be
     used in a Traditional Chinese locale. But the resulting text is still a
     reasonable key for looking up word frequencies.
     """
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
 def jieba_tokenize(text, external_wordlist=False):
     """
-    If `external_wordlist` is False, this will tokenize the given text with our
-    custom Jieba dictionary, which contains only the strings that have
-    frequencies in wordfreq.
+    Tokenize the given text into tokens whose word frequencies can probably
+    be looked up. This uses Jieba, a word-frequency-based tokenizer.
 
-    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
-    the purpose of looking up frequencies, it's ideal.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
 
     If `external_wordlist` is True, this will use the largest version of
-    Jieba's original dictionary, so its results will be independent of the
-    data in wordfreq.
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
     """
     global jieba_tokenizer, jieba_orig_tokenizer
     if external_wordlist:
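A short usage sketch of the two functions touched above, assuming this version of the module is importable as `wordfreq.chinese` and that `jieba` and wordfreq's Chinese data are installed; the example strings are arbitrary.

    from wordfreq import word_frequency
    from wordfreq.chinese import jieba_tokenize, simplify_chinese

    text = '谢谢你的帮助'   # "thanks for your help"

    # Default mode: tokens come from wordfreq's own wordlist (no HMM), so
    # each multi-character token should have a frequency we can look up.
    for token in jieba_tokenize(text):
        print(token, word_frequency(token, 'zh'))

    # external_wordlist=True: Jieba's own large dictionary with HMM enabled;
    # better general-purpose segmentation, but some tokens may be missing
    # from wordfreq's data.
    print(list(jieba_tokenize(text, external_wordlist=True)))

    # simplify_chinese maps characters one-by-one to Simplified Chinese and
    # case-folds the result, to normalize lookup keys.
    print(simplify_chinese('漢語'))   # -> '汉语'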
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
-from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""
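The last hunk appears to come from the module that defines `TOKEN_RE` (the tokenizer); only the opening of the pattern is visible here. As a hedged illustration of the general setup -- the third-party `regex` module compiled in verbose mode with Unicode property classes -- and not wordfreq's actual expression:

    import regex

    # Illustrative token pattern only; wordfreq's real TOKEN_RE is longer
    # and handles many more cases.
    SIMPLE_TOKEN_RE = regex.compile(r"""
        [\p{L}\p{M}\p{N}]+    # runs of letters, combining marks, and digits
    """, regex.V1 | regex.VERBOSE)

    print(SIMPLE_TOKEN_RE.findall("café costs 3.50 euros"))
    # ['café', 'costs', '3', '50', 'euros']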