Express the combining of word frequencies in an explicitly associative and commutative way.

Former-commit-id: 32b4033d63
This commit is contained in:
Andrew Lin 2015-07-09 15:26:54 -04:00
parent ce364297a2
commit 5c72e68b7e
4 changed files with 17 additions and 67 deletions

View File

@ -1,7 +1,6 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize,
half_harmonic_mean
top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises
@ -114,11 +113,8 @@ def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)
assert_almost_equal(
plant,
half_harmonic_mean(
word_frequency('plan', 'en'),
word_frequency('t', 'en')
)
1.0 / plant,
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
)

View File

@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency, half_harmonic_mean
from wordfreq import tokenize, word_frequency
def test_tokens():
@ -17,10 +17,7 @@ def test_combination():
ohayou_freq / 2
)
assert_almost_equal(
word_frequency('おはようございます', 'ja'),
half_harmonic_mean(
half_harmonic_mean(ohayou_freq, gozai_freq),
masu_freq
)
1.0 / word_frequency('おはようございます', 'ja'),
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
)

View File

@ -1,30 +0,0 @@
from nose.tools import assert_less_equal, assert_almost_equal
from wordfreq import half_harmonic_mean
from functools import reduce
import random
def check_hm_properties(inputs):
    """
    Check the algebraic claims made about half_harmonic_mean on one sample.

    Folding the function over `inputs` must give (almost) the same value
    regardless of operand order, which exercises associativity and
    commutativity. Doubling one operand must not make the folded result
    smaller, which exercises monotonicity.

    Note that `inputs` is shuffled and modified in place.
    """
    def fold(values):
        return reduce(half_harmonic_mean, values)

    in_given_order = fold(inputs)
    random.shuffle(inputs)
    in_shuffled_order = fold(inputs)
    assert_almost_equal(in_given_order, in_shuffled_order)

    # Grow a single operand and make sure the combined value does not shrink.
    inputs[0] *= 2
    after_doubling = fold(inputs)
    assert_less_equal(in_shuffled_order, after_doubling)
def test_half_harmonic_mean():
    """
    Generate property checks for half_harmonic_mean on random samples.

    Yields ten (check_hm_properties, inputs) pairs for every list length
    from 2 through 5.
    """
    repetitions = 10
    for size in range(2, 6):
        for _ in range(repetitions):
            # Exponentially distributed samples (rate 0.01), so the values
            # are arbitrary and effectively always positive.
            sample = []
            while len(sample) < size:
                sample.append(random.expovariate(0.01))
            yield check_hm_properties, sample

View File

@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
return itertools.chain(*get_frequency_list(lang, wordlist))
def half_harmonic_mean(a, b):
    """
    Return half the harmonic mean of `a` and `b`, i.e. (a * b) / (a + b).

    For non-negative inputs this is an associative, commutative, monotonic
    function whose value is less than or equal to both a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    When both inputs are zero the formula would divide by zero, so the
    limiting value 0.0 is returned instead.
    """
    if a == 0 and b == 0:
        # 0 / 0 is undefined; the limit as both inputs approach zero is zero,
        # which also keeps the result <= both inputs.
        return 0.0
    return (a * b) / (a + b)
# This dict and the inner function below are used to implement a "drop
# everything" cache for word_frequency(). The overhead of lru_cache() is
# comparable to the time it takes to look up frequencies from scratch, so
# something faster is needed.
# NOTE(review): the code that fills and empties this dict is not visible in
# this hunk -- presumably it lives in word_frequency(); confirm there.
_wf_cache = {}
def _word_frequency(word, lang, wordlist, minimum):
    """
    Look up the frequency of `word`, combining its tokens if there are several.

    `word` is split with tokenize(word, lang), and each token is looked up in
    the dictionary returned by get_frequency_dict(lang, wordlist).

    Returns `minimum` if `word` produces no tokens or if any token is missing
    from the frequency dictionary; otherwise returns the combined frequency,
    but never less than `minimum`.
    """
    tokens = tokenize(word, lang)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum. Because the reciprocals are
    # simply added up, the combination is explicitly associative and
    # commutative.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    return max(1.0 / one_over_result, minimum)
def word_frequency(word, lang, wordlist='combined', minimum=0.):
"""