mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
code review fixes to __init__
This commit is contained in:
parent
8656688b0b
commit
de81a23b9d
@ -18,12 +18,6 @@ logger = logging.getLogger(__name__)
|
|||||||
CACHE_SIZE = 100000
|
CACHE_SIZE = 100000
|
||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
# Chinese and Japanese are written without spaces. In Chinese, in particular,
|
|
||||||
# we have to infer word boundaries from the frequencies of the words they
|
|
||||||
# would create. When this happens, we should adjust the resulting frequency
|
|
||||||
# to avoid creating a bias toward improbable word combinations.
|
|
||||||
INFERRED_SPACE_LANGUAGES = {'zh'}
|
|
||||||
|
|
||||||
# We'll divide the frequency by 10 for each token boundary that was inferred.
|
# We'll divide the frequency by 10 for each token boundary that was inferred.
|
||||||
# (We determined the factor of 10 empirically by looking at words in the
|
# (We determined the factor of 10 empirically by looking at words in the
|
||||||
# Chinese wordlist that weren't common enough to be identified by the
|
# Chinese wordlist that weren't common enough to be identified by the
|
||||||
@ -269,6 +263,10 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
|
|||||||
- 'small': a wordlist built from at least 3 sources, containing word
|
- 'small': a wordlist built from at least 3 sources, containing word
|
||||||
frequencies of 10^-6 and higher
|
frequencies of 10^-6 and higher
|
||||||
- 'best': uses 'large' if available, and 'small' otherwise
|
- 'best': uses 'large' if available, and 'small' otherwise
|
||||||
|
|
||||||
|
The value returned will always be at least as large as `minimum`.
|
||||||
|
You could set this value to 10^-8, for example, to return 10^-8 for
|
||||||
|
unknown words in the 'large' list instead of 0, avoiding a discontinuity.
|
||||||
"""
|
"""
|
||||||
args = (word, lang, wordlist, minimum)
|
args = (word, lang, wordlist, minimum)
|
||||||
try:
|
try:
|
||||||
|
Loading…
Reference in New Issue
Block a user