Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
Commit 44b0c4f9ba (parent cd0797e1c8)

Fix documentation and clean up, based on Sep 25 code review

Changed files: README.md (16 lines changed), plus the modules shown in the later hunks.
README.md
@@ -192,14 +192,16 @@ into multiple tokens:
     3.2187603965715087e-06
 
 The word frequencies are combined with the half-harmonic-mean function in order
-to provide an estimate of what their combined frequency would be. In languages
-written without spaces, there is also a penalty to the word frequency for each
-word break that must be inferred.
+to provide an estimate of what their combined frequency would be. In Chinese,
+where the word breaks must be inferred from the frequency of the resulting
+words, there is also a penalty to the word frequency for each word break that
+must be inferred.
 
-This implicitly assumes that you're asking about words that frequently appear
-together. It's not multiplying the frequencies, because that would assume they
-are statistically unrelated. So if you give it an uncommon combination of
-tokens, it will hugely over-estimate their frequency:
+This method of combining word frequencies implicitly assumes that you're asking
+about words that frequently appear together. It's not multiplying the
+frequencies, because that would assume they are statistically unrelated. So if
+you give it an uncommon combination of tokens, it will hugely over-estimate
+their frequency:
 
     >>> word_frequency('owl-flavored', 'en')
     1.3557098723512335e-06
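The README text above describes the combination rule only in words. As a rough sketch of the idea (not wordfreq's actual code: the names half_harmonic_mean, combined_frequency, and INFERRED_BREAK_PENALTY, and the 0.5 penalty value, are assumptions for illustration), the pairwise half-harmonic mean can be reduced over any number of token frequencies, and each inferred word break scales the result down:

    from functools import reduce

    # Hypothetical sketch -- names and the 0.5 penalty factor are assumptions,
    # not wordfreq's internals.
    INFERRED_BREAK_PENALTY = 0.5

    def half_harmonic_mean(a, b):
        # Pairwise half-harmonic mean of two frequencies.
        return (a * b) / (a + b)

    def combined_frequency(freqs, inferred_breaks=0):
        # Combine the per-token frequencies, then penalize each word break
        # that had to be inferred (as in Chinese text written without spaces).
        combined = reduce(half_harmonic_mean, freqs)
        return combined * (INFERRED_BREAK_PENALTY ** inferred_breaks)

    print(combined_frequency([1e-4, 2e-4]))                      # ~6.67e-05
    print(combined_frequency([1e-4, 2e-4], inferred_breaks=1))   # ~3.33e-05

Reducing half_harmonic_mean over n frequencies equals 1 / (1/f1 + ... + 1/fn); for realistic word frequencies (well below 1) this is far larger than the product of the frequencies, which is exactly why an uncommon combination like 'owl-flavored' gets over-estimated rather than multiplied down toward zero.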
@@ -10,10 +10,29 @@ jieba_tokenizer = None
 
 
 def simplify_chinese(text):
+    """
+    Convert Chinese text character-by-character to Simplified Chinese, for the
+    purpose of looking up word frequencies.
+
+    This is far too simple to be a proper Chinese-to-Chinese "translation"; it
+    will sometimes produce nonsense words by simplifying characters that would
+    not be simplified in context, or by simplifying words that would only be
+    used in a Traditional Chinese locale. But the resulting text is still a
+    reasonable key for looking up word frequencies.
+    """
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
 def jieba_tokenize(text):
+    """
+    Tokenize the given text into tokens whose word frequencies can probably
+    be looked up. This uses Jieba, a word-frequency-based tokenizer.
+
+    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
+    to infer unknown words using a hidden Markov model. This ensures that the
+    multi-character tokens that it outputs will be ones whose word frequencies
+    we can look up.
+    """
     global jieba_tokenizer
     if jieba_tokenizer is None:
         jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
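This hunk, which appears to come from wordfreq's Chinese tokenization module, shows only the new docstrings plus surrounding context; the body of jieba_tokenize after the lazy construction is cut off. A minimal sketch of the pattern the docstring describes, assuming the tokenizer is then invoked with Jieba's HMM-based guessing turned off (the dictionary path and the final lcut call are assumptions, not lines from the hunk):

    import jieba

    DICT_FILENAME = 'chinese_wordlist.txt'  # assumed stand-in for the real path
    jieba_tokenizer = None

    def jieba_tokenize(text):
        global jieba_tokenizer
        if jieba_tokenizer is None:
            # Lazy initialization: build the tokenizer once, loading wordfreq's
            # own Chinese wordlist instead of Jieba's default dictionary.
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
        # HMM=False keeps Jieba from inferring unknown words, so the
        # multi-character tokens it returns should all be in the wordlist.
        return jieba_tokenizer.lcut(text, HMM=False)

SIMPLIFIED_MAP, used by simplify_chinese in the same hunk, is presumably a str.translate-style table mapping Traditional characters to their Simplified counterparts, which is what makes a purely character-by-character conversion possible, and what the docstring warns can produce nonsense for characters whose simplification depends on context.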
@@ -1,6 +1,5 @@
 import regex
 import unicodedata
-from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""
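The last hunk cuts off at the opening of TOKEN_RE. Purely as an illustration of what a verbose pattern compiled with the third-party regex module can look like (this example is hypothetical and is not the project's actual TOKEN_RE):

    import regex

    # Hypothetical example only -- not wordfreq's real token expression.
    EXAMPLE_TOKEN_RE = regex.compile(r"""
        [\p{L}\p{M}\p{Nd}]+    # a run of letters, combining marks, and digits
    """, regex.VERBOSE)

    print(EXAMPLE_TOKEN_RE.findall("café culture, 2015"))
    # ['café', 'culture', '2015']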