mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
add Zipf scale
This commit is contained in:
parent
df8caaff7d
commit
9907948d11
@ -8,6 +8,7 @@ import itertools
|
||||
import pathlib
|
||||
import random
|
||||
import logging
|
||||
import math
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -146,6 +147,42 @@ def cB_to_freq(cB):
|
||||
return 10 ** (cB / 100)
|
||||
|
||||
|
||||
def cB_to_zipf(cB):
    """
    Translate a frequency expressed in centibels into a Zipf-scale value
    (the Zipf scale itself is described in `zipf_to_freq`).

    Centibels are the logarithmic unit wordfreq works with internally. A
    Zipf unit is just a bel measured from a different zero point, so the
    conversion shifts the centibel value up by 900 and then scales it down
    by a factor of 100.
    """
    # Shift first, then scale: this keeps the arithmetic (and therefore the
    # floating-point result) the same as adding 900 and dividing by 100.
    shifted = cB + 900
    return shifted / 100
|
||||
|
||||
|
||||
def zipf_to_freq(zipf):
    """
    Turn a Zipf-scale value into a word frequency expressed as a proportion
    between 0 and 1.

    The Zipf scale is a logarithmic frequency scale proposed by Marc
    Brysbaert, who compiled the SUBTLEX data. Its purpose is to express
    reasonable word frequencies as small, positive, easy-to-read numbers.

    A word has a Zipf value of x when it appears 10**x times per billion
    words. A word seen once per million words, for instance, sits at 3.0.
    """
    # 10 ** zipf is the number of occurrences per billion words; dividing
    # by a billion turns that count into a proportion.
    occurrences_per_billion = 10 ** zipf
    return occurrences_per_billion / 1e9
|
||||
|
||||
|
||||
def freq_to_zipf(freq):
    """
    Convert a word frequency from a proportion between 0 and 1 to the
    Zipf scale (see `zipf_to_freq`).

    `freq` must be greater than zero; `math.log10` raises ValueError for
    zero or negative input.
    """
    # math.log10 is used instead of math.log(freq, 10): the two-argument form
    # computes log(freq) / log(10), which introduces rounding error even for
    # exact powers of ten. Adding 9 moves the zero point from "proportion of
    # 1" to "once per billion words".
    return math.log10(freq) + 9
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
|
||||
"""
|
||||
@ -202,6 +239,7 @@ def _word_frequency(word, lang, wordlist, minimum):
|
||||
|
||||
return max(freq, minimum)
|
||||
|
||||
|
||||
def word_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
"""
|
||||
Get the frequency of `word` in the language with code `lang`, from the
|
||||
@ -240,6 +278,33 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
return _wf_cache[args]
|
||||
|
||||
|
||||
def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
    """
    Look up how frequent `word` is in the language with code `lang`, and
    report the answer on the Zipf scale.

    The Zipf scale is a logarithmic frequency scale proposed by Marc
    Brysbaert, who compiled the SUBTLEX data. Its purpose is to express
    reasonable word frequencies as small, positive, easy-to-read numbers.

    A word has a Zipf value of x when it appears 10**x times per billion
    words. A word seen once per million words, for instance, sits at 3.0.

    Reasonable words have Zipf values between 0 and 8. The returned value
    is always at least as large as `minimum`, even for a word that never
    appears. The default minimum is 0, which stands for words appearing
    once per billion words or less.

    wordfreq internally quantizes its frequencies to centibels, which are
    1/100 of a Zipf unit, so the result is rounded to the nearest hundredth
    to match that quantization.
    """
    # The lookup works on proportions, so the Zipf-scale floor is converted
    # to a proportion before it is handed to word_frequency.
    floor_as_proportion = zipf_to_freq(minimum)
    proportion = word_frequency(word, lang, wordlist, floor_as_proportion)
    zipf_value = freq_to_zipf(proportion)
    return round(zipf_value, 2)
|
||||
|
||||
|
||||
@lru_cache(maxsize=100)
|
||||
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user