mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge pull request #29 from LuminosoInsight/code-review-notes-20150925
Fix documentation and clean up, based on Sep 25 code review
This commit is contained in:
commit
15d99be21b
16
README.md
16
README.md
@ -192,14 +192,16 @@ into multiple tokens:
|
||||
3.2187603965715087e-06
|
||||
|
||||
The word frequencies are combined with the half-harmonic-mean function in order
|
||||
to provide an estimate of what their combined frequency would be. In languages
|
||||
written without spaces, there is also a penalty to the word frequency for each
|
||||
word break that must be inferred.
|
||||
to provide an estimate of what their combined frequency would be. In Chinese,
|
||||
where the word breaks must be inferred from the frequency of the resulting
|
||||
words, there is also a penalty to the word frequency for each word break that
|
||||
must be inferred.
|
||||
|
||||
This implicitly assumes that you're asking about words that frequently appear
|
||||
together. It's not multiplying the frequencies, because that would assume they
|
||||
are statistically unrelated. So if you give it an uncommon combination of
|
||||
tokens, it will hugely over-estimate their frequency:
|
||||
This method of combining word frequencies implicitly assumes that you're asking
|
||||
about words that frequently appear together. It's not multiplying the
|
||||
frequencies, because that would assume they are statistically unrelated. So if
|
||||
you give it an uncommon combination of tokens, it will hugely over-estimate
|
||||
their frequency:
|
||||
|
||||
>>> word_frequency('owl-flavored', 'en')
|
||||
1.3557098723512335e-06
|
||||
|
@ -10,10 +10,29 @@ jieba_tokenizer = None
|
||||
|
||||
|
||||
def simplify_chinese(text):
|
||||
"""
|
||||
Convert Chinese text character-by-character to Simplified Chinese, for the
|
||||
purpose of looking up word frequencies.
|
||||
|
||||
This is far too simple to be a proper Chinese-to-Chinese "translation"; it
|
||||
will sometimes produce nonsense words by simplifying characters that would
|
||||
not be simplified in context, or by simplifying words that would only be
|
||||
used in a Traditional Chinese locale. But the resulting text is still a
|
||||
reasonable key for looking up word frequencies.
|
||||
"""
|
||||
return text.translate(SIMPLIFIED_MAP).casefold()
|
||||
|
||||
|
||||
def jieba_tokenize(text):
|
||||
"""
|
||||
Tokenize the given text into tokens whose word frequencies can probably
|
||||
be looked up. This uses Jieba, a word-frequency-based tokenizer.
|
||||
|
||||
We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
|
||||
to infer unknown words using a hidden Markov model. This ensures that the
|
||||
multi-character tokens that it outputs will be ones whose word frequencies
|
||||
we can look up.
|
||||
"""
|
||||
global jieba_tokenizer
|
||||
if jieba_tokenizer is None:
|
||||
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
|
||||
|
@ -1,6 +1,5 @@
|
||||
import regex
|
||||
import unicodedata
|
||||
from pkg_resources import resource_filename
|
||||
|
||||
|
||||
TOKEN_RE = regex.compile(r"""
|
||||
|
Loading…
Reference in New Issue
Block a user