add Zipf scale

This commit is contained in:
Rob Speer 2016-01-21 14:07:01 -05:00
parent df8caaff7d
commit 9907948d11

View File

@ -8,6 +8,7 @@ import itertools
import pathlib import pathlib
import random import random
import logging import logging
import math
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -146,6 +147,42 @@ def cB_to_freq(cB):
return 10 ** (cB / 100) return 10 ** (cB / 100)
def cB_to_zipf(cB):
    """
    Translate a word frequency expressed in centibels into its Zipf-scale
    equivalent (the Zipf scale itself is described in `zipf_to_freq`).

    Centibels are the logarithmic unit that wordfreq works with internally.
    A Zipf unit is simply one bel measured from a different zero point, so
    the conversion is a shift of 900 centibels followed by a division by 100.
    """
    shifted_cB = cB + 900
    return shifted_cB / 100
def zipf_to_freq(zipf):
    """
    Turn a Zipf-scale value back into a word frequency expressed as a
    proportion between 0 and 1.

    The Zipf scale is a logarithmic frequency scale proposed by Marc
    Brysbaert, who compiled the SUBTLEX data. Its purpose is to map
    reasonable word frequencies onto small, positive, easy-to-read numbers.

    A word sits at x on the Zipf scale when it occurs 10**x times per billion
    words. A word seen once per million words, for instance, is at 3.0.
    """
    occurrences_per_billion = 10 ** zipf
    return occurrences_per_billion / 1e9
def freq_to_zipf(freq):
    """
    Convert a word frequency from a proportion between 0 and 1 to the
    Zipf scale (see `zipf_to_freq`).

    The result is the base-10 logarithm of the number of occurrences per
    billion words, i.e. ``log10(freq) + 9``.

    Raises ValueError if `freq` is zero or negative, because the logarithm
    is undefined there.
    """
    # math.log10 is preferred over math.log(freq, 10): the two-argument form
    # is evaluated as log(freq) / log(10), which rounds twice and is usually
    # less accurate than the dedicated base-10 routine.
    return math.log10(freq) + 9
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='combined', match_cutoff=30): def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
""" """
@ -202,6 +239,7 @@ def _word_frequency(word, lang, wordlist, minimum):
return max(freq, minimum) return max(freq, minimum)
def word_frequency(word, lang, wordlist='combined', minimum=0.): def word_frequency(word, lang, wordlist='combined', minimum=0.):
""" """
Get the frequency of `word` in the language with code `lang`, from the Get the frequency of `word` in the language with code `lang`, from the
@ -240,6 +278,33 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
return _wf_cache[args] return _wf_cache[args]
def zipf_frequency(word, lang, wordlist='combined', minimum=0.):
    """
    Look up how frequent `word` is in the language with code `lang`, and
    report the answer on the Zipf scale.

    The Zipf scale is a logarithmic frequency scale proposed by Marc
    Brysbaert, who compiled the SUBTLEX data. Its purpose is to map
    reasonable word frequencies onto small, positive, easy-to-read numbers.

    A word sits at x on the Zipf scale when it occurs 10**x times per billion
    words. A word seen once per million words, for instance, is at 3.0.

    Reasonable words have Zipf values between 0 and 8. The returned value is
    always at least as large as `minimum`, even for a word that never
    appears. The default minimum is 0, which stands for words that appear
    once per billion words or less.

    wordfreq internally quantizes its frequencies to centibels, which are
    1/100 of a Zipf unit, so the result is rounded to the nearest hundredth
    to match that quantization.
    """
    # word_frequency takes its floor as a proportion, so convert the Zipf
    # minimum before passing it along.
    floor_as_proportion = zipf_to_freq(minimum)
    proportion = word_frequency(word, lang, wordlist, floor_as_proportion)
    zipf_value = freq_to_zipf(proportion)
    return round(zipf_value, 2)
@lru_cache(maxsize=100) @lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False): def top_n_list(lang, n, wordlist='combined', ascii_only=False):
""" """