diff --git a/README.md b/README.md
index ced4567..893b4c7 100644
--- a/README.md
+++ b/README.md
@@ -205,14 +205,16 @@ into multiple tokens:
     3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
-written without spaces, there is also a penalty to the word frequency for each
-word break that must be inferred.
+to provide an estimate of what their combined frequency would be. In Chinese,
+where the word breaks must be inferred from the frequency of the resulting
+words, there is also a penalty to the word frequency for each word break that
+must be inferred.
 
-This implicitly assumes that you're asking about words that frequently appear
-together. It's not multiplying the frequencies, because that would assume they
-are statistically unrelated. So if you give it an uncommon combination of
-tokens, it will hugely over-estimate their frequency:
+This method of combining word frequencies implicitly assumes that you're asking
+about words that frequently appear together. It's not multiplying the
+frequencies, because that would assume they are statistically unrelated. So if
+you give it an uncommon combination of tokens, it will hugely over-estimate
+their frequency:
 
     >>> word_frequency('owl-flavored', 'en')
     1.3557098723512335e-06
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index be3c90b..c57e937 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -12,21 +12,34 @@ jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
+    """
+    Convert Chinese text character-by-character to Simplified Chinese, for the
+    purpose of looking up word frequencies.
+
+    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
+    will sometimes produce nonsense words by simplifying characters that would
+    not be simplified in context, or by simplifying words that would only be
+    used in a Traditional Chinese locale. But the resulting text is still a
+    reasonable key for looking up word frequencies.
+    """
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
 def jieba_tokenize(text, external_wordlist=False):
     """
-    If `external_wordlist` is False, this will tokenize the given text with our
-    custom Jieba dictionary, which contains only the strings that have
-    frequencies in wordfreq.
+    Tokenize the given text into tokens whose word frequencies can probably
+    be looked up. This uses Jieba, a word-frequency-based tokenizer.
 
-    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
-    the purpose of looking up frequencies, it's ideal.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
 
     If `external_wordlist` is True, this will use the largest version of
-    Jieba's original dictionary, so its results will be independent of the
-    data in wordfreq.
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
     """
     global jieba_tokenizer, jieba_orig_tokenizer
     if external_wordlist:
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 65a9735..43dbfff 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
-from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""