diff --git a/README.md b/README.md
index b8c8dbb..6f8c000 100644
--- a/README.md
+++ b/README.md
@@ -192,14 +192,16 @@ into multiple tokens:
     3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
-written without spaces, there is also a penalty to the word frequency for each
-word break that must be inferred.
+to provide an estimate of what their combined frequency would be. In Chinese,
+where the word breaks must be inferred from the frequency of the resulting
+words, there is also a penalty to the word frequency for each word break that
+must be inferred.
 
-This implicitly assumes that you're asking about words that frequently appear
-together. It's not multiplying the frequencies, because that would assume they
-are statistically unrelated. So if you give it an uncommon combination of
-tokens, it will hugely over-estimate their frequency:
+This method of combining word frequencies implicitly assumes that you're asking
+about words that frequently appear together. It's not multiplying the
+frequencies, because that would assume they are statistically unrelated. So if
+you give it an uncommon combination of tokens, it will hugely over-estimate
+their frequency:
 
     >>> word_frequency('owl-flavored', 'en')
     1.3557098723512335e-06
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index c07e77e..c923f83 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -10,10 +10,29 @@ jieba_tokenizer = None
 
 
 def simplify_chinese(text):
+    """
+    Convert Chinese text character-by-character to Simplified Chinese, for the
+    purpose of looking up word frequencies.
+
+    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
+    will sometimes produce nonsense words by simplifying characters that would
+    not be simplified in context, or by simplifying words that would only be
+    used in a Traditional Chinese locale. But the resulting text is still a
+    reasonable key for looking up word frequencies.
+    """
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
 def jieba_tokenize(text):
+    """
+    Tokenize the given text into tokens whose word frequencies can probably
+    be looked up. This uses Jieba, a word-frequency-based tokenizer.
+
+    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
+    to infer unknown words using a hidden Markov model. This ensures that the
+    multi-character tokens that it outputs will be ones whose word frequencies
+    we can look up.
+    """
     global jieba_tokenizer
     if jieba_tokenizer is None:
         jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index b9c156c..c67c302 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
-from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""
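
To make the half-harmonic-mean combination described in the updated README text concrete, here is a minimal sketch in Python. It is illustrative only: the names half_harmonic_mean and combined_frequency, the pairwise reduction over token frequencies, and the per-break penalty factor are assumptions based on the README's wording, not necessarily wordfreq's actual implementation.

    from functools import reduce

    def half_harmonic_mean(a, b):
        # The harmonic mean of a and b is 2ab / (a + b); half of it is
        # ab / (a + b), which is always smaller than either input.
        return (a * b) / (a + b)

    def combined_frequency(token_freqs, inferred_breaks=0, break_penalty=0.5):
        # Combine the per-token frequencies pairwise, then apply a penalty
        # for each word break that had to be inferred (the Chinese case
        # described above). The penalty factor here is made up for the
        # example, not wordfreq's actual value.
        freq = reduce(half_harmonic_mean, token_freqs)
        return freq * (break_penalty ** inferred_breaks)

    # Two tokens with frequencies 1e-4 and 2e-4 combine to
    # (1e-4 * 2e-4) / (1e-4 + 2e-4) = 6.67e-05: lower than either
    # frequency, but far higher than their product, 2e-08.
    print(combined_frequency([1e-4, 2e-4]))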