wordfreq/wordfreq/numbers.py
Elia Robyn Lake 981fab53aa add py.typed
2022-03-10 19:16:38 -05:00

113 lines
3.8 KiB
Python

import regex
# Frequencies of leading digits, according to Benford's law, sort of.
# Benford's law doesn't describe numbers with leading zeroes, because "007"
# and "7" are the same number, but for us they should have different frequencies.
# I added an estimate for the frequency of numbers with leading zeroes.
#
# The list is indexed by the leading digit itself: DIGIT_FREQS[0] is the added
# leading-zero estimate and DIGIT_FREQS[1] through DIGIT_FREQS[9] are the
# estimates for the digits 1-9. The ten values add up to 1.
DIGIT_FREQS = [0.009, 0.300, 0.175, 0.124, 0.096, 0.078, 0.066, 0.057, 0.050, 0.045]
# Suppose you have a token NNNN, a 4-digit number representing a year. We're making
# a probability distribution of P(token=NNNN | token is 4 digits).
#
# We do this with a piecewise exponential function whose peak is a plateau covering
# the years 2019 to 2039.
#
# YEAR_LOG_PEAK is the base-10 logarithm of the probability at that peak.
# Determined by experimentation: makes the probabilities of all years add up to 90%.
# The other 10% goes to NOT_YEAR_PROB. tests/test_numbers.py confirms that this
# probability distribution adds up to 1.
YEAR_LOG_PEAK = -1.9185
# Probability mass reserved for 4-digit tokens that are not years; it is spread
# over the tokens using the leading-digit (Benford) estimate.
NOT_YEAR_PROB = 0.1
# Last year of the rising part of the curve; the plateau begins right after it.
REFERENCE_YEAR = 2019
# Number of years after REFERENCE_YEAR that stay at the peak probability.
PLATEAU_WIDTH = 20
# A single digit.
DIGIT_RE = regex.compile(r"\d")
# A digit followed by one or more digits, periods, or commas, so a match is
# always at least two characters long (e.g. "42", "3.14", "12,345").
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
# An unbroken run of one or more digits, with no punctuation.
PURE_DIGIT_RE = regex.compile(r"\d+")
def benford_freq(text: str) -> float:
    """
    Return the estimated frequency of the digit string `text`, based on
    Benford's law.

    The estimate is the frequency of the leading digit (looked up in
    DIGIT_FREQS), divided by 10 for every digit that follows it.
    """
    leading = int(text[0])
    following_digits = len(text) - 1
    return DIGIT_FREQS[leading] / 10**following_digits
def year_freq(text: str) -> float:
    """
    Return the estimated relative frequency of the 4-digit token `text`,
    treating it mainly as a year.

    Take text == "1985" as an example. The result estimates how likely it is
    that a token drawn at random from a large corpus is "1985", _given_ that
    the token has 4 digits. Tokens with any other number of digits play no
    part in this distribution.

    The result is the sum of two parts: the probability that the token is this
    particular year, and the probability that it is a non-year number that
    happens to have these digits.
    """
    year = int(text)
    plateau_end = REFERENCE_YEAR + PLATEAU_WIDTH

    if year > plateau_end:
        # Past the plateau, drop off steeply to get back down to the (low)
        # real frequency of far-future years. This slope is invented to fit
        # the equally invented plateau.
        log_freq = YEAR_LOG_PEAK - 0.2 * (year - plateau_end)
    elif year > REFERENCE_YEAR:
        # The Google Books data was last collected in 2019, and time keeps
        # passing after that. Rather than guess what year it is "now", hold
        # the frequency of the "present" year constant for PLATEAU_WIDTH
        # years after the reference year.
        log_freq = YEAR_LOG_PEAK
    else:
        # Up to and including the reference year, use a straight line fitted
        # to the curve shown at
        # https://twitter.com/r_speer/status/1493715982887571456.
        log_freq = YEAR_LOG_PEAK - 0.0083 * (REFERENCE_YEAR - year)

    prob_as_year = 10.0**log_freq

    # When the token is _not_ a year, fall back on the Benford leading-digit
    # estimate, scaled by the share of 4-digit tokens that aren't years.
    prob_as_other_number = NOT_YEAR_PROB * benford_freq(text)
    return prob_as_year + prob_as_other_number
def digit_freq(text: str) -> float:
    """
    Return the estimated relative frequency of the digit strings in `text`.

    Every MULTI_DIGIT_RE match in the text is split into its unbroken runs of
    digits. A run of exactly 4 digits is scored with `year_freq`; any other
    run is scored with `benford_freq`. The scores of all runs are multiplied
    together, and the result is 1.0 when the text has no such match.
    """
    product = 1.0
    for numeric_span in MULTI_DIGIT_RE.findall(text):
        for run in PURE_DIGIT_RE.findall(numeric_span):
            estimate = year_freq if len(run) == 4 else benford_freq
            product *= estimate(run)
    return product
def has_digit_sequence(text: str) -> bool:
    """
    Returns True iff the text has a digit sequence that will be normalized out
    and handled with `digit_freq`.

    A digit sequence is anything MULTI_DIGIT_RE matches: a digit followed by
    at least one more digit, period, or comma. A lone digit does not count.

    The whole text is scanned, not just its beginning, because `smash_numbers`
    and `digit_freq` both act on digit sequences wherever they occur. Checking
    only the start of the text would report False for a token such as "win95"
    even though `smash_numbers` rewrites it.
    """
    # `search` looks for a match at any position; `match` would only accept
    # one starting at index 0.
    return MULTI_DIGIT_RE.search(text) is not None
def _sub_zeroes(match: regex.Match) -> str:
    """
    Return the text matched by `match`, with each of its digits turned into
    "0". Characters that are not digits are left as they are.
    """
    matched_text = match.group()
    return DIGIT_RE.sub("0", matched_text)
def smash_numbers(text: str) -> str:
    """
    Turn the digits of every multi-digit sequence in `text` into zeroes, so
    that thousands of different numbers don't each need a frequency of their
    own.

    For example, "12,345" becomes "00,000". A lone digit such as "7" is left
    alone, because MULTI_DIGIT_RE needs at least two characters to match.
    """
    smashed = MULTI_DIGIT_RE.sub(_sub_zeroes, text)
    return smashed