mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Express the combining of word frequencies in an explicitly associative and commutative way.
This commit is contained in:
parent
2821f23e79
commit
32b4033d63
@@ -1,7 +1,6 @@
|
||||
from wordfreq import (
|
||||
word_frequency, available_languages, cB_to_freq,
|
||||
top_n_list, random_words, random_ascii_words, tokenize,
|
||||
half_harmonic_mean
|
||||
top_n_list, random_words, random_ascii_words, tokenize
|
||||
)
|
||||
from nose.tools import (
|
||||
eq_, assert_almost_equal, assert_greater, raises
|
||||
@@ -114,12 +113,9 @@ def test_phrase_freq():
|
||||
plant = word_frequency("plan.t", 'en')
|
||||
assert_greater(plant, 0)
|
||||
assert_almost_equal(
|
||||
plant,
|
||||
half_harmonic_mean(
|
||||
word_frequency('plan', 'en'),
|
||||
word_frequency('t', 'en')
|
||||
)
|
||||
)
|
||||
1.0 / plant,
|
||||
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
|
||||
)
|
||||
|
||||
|
||||
def test_not_really_random():
|
||||
|
@@ -1,5 +1,5 @@
|
||||
from nose.tools import eq_, assert_almost_equal
|
||||
from wordfreq import tokenize, word_frequency, half_harmonic_mean
|
||||
from wordfreq import tokenize, word_frequency
|
||||
|
||||
|
||||
def test_tokens():
|
||||
@@ -17,10 +17,7 @@ def test_combination():
|
||||
ohayou_freq / 2
|
||||
)
|
||||
assert_almost_equal(
|
||||
word_frequency('おはようございます', 'ja'),
|
||||
half_harmonic_mean(
|
||||
half_harmonic_mean(ohayou_freq, gozai_freq),
|
||||
masu_freq
|
||||
)
|
||||
1.0 / word_frequency('おはようございます', 'ja'),
|
||||
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
|
||||
)
|
||||
|
||||
|
@@ -1,30 +0,0 @@
|
||||
from nose.tools import assert_less_equal, assert_almost_equal
|
||||
from wordfreq import half_harmonic_mean
|
||||
from functools import reduce
|
||||
import random
|
||||
|
||||
|
||||
def check_hm_properties(inputs):
    """
    Check order-independence and monotonicity of `half_harmonic_mean`.

    Folding `half_harmonic_mean` over `inputs` must give (almost) the same
    result after the values are shuffled, and doubling one of the values
    must not make the combined result smaller.

    `inputs` is a non-empty list of numbers. It is copied before being
    shuffled and modified, so the caller's list is left untouched.
    """
    # I asserted that the half-harmonic-mean formula is associative,
    # commutative, monotonic, and less than or equal to its inputs.
    # (Less if its inputs are strictly positive, in fact.)
    #
    # So let's test that what I said is true.
    values = list(inputs)

    # Associativity and commutativity: the order of combination is irrelevant.
    hm1 = reduce(half_harmonic_mean, values)
    random.shuffle(values)
    hm2 = reduce(half_harmonic_mean, values)
    assert_almost_equal(hm1, hm2)

    # Monotonicity: increasing one input cannot decrease the result.
    values[0] *= 2
    hm3 = reduce(half_harmonic_mean, values)
    assert_less_equal(hm2, hm3)
|
||||
|
||||
|
||||
def test_half_harmonic_mean():
    """
    Yield `check_hm_properties` checks over randomly generated inputs.

    For each list length from 2 to 5, ten lists of numbers drawn with
    `random.expovariate(0.01)` are generated, and each list is yielded
    together with the check function (generator-style test).
    """
    for count in range(2, 6):
        for _ in range(10):
            # get some strictly positive arbitrary numbers
            inputs = [random.expovariate(0.01) for _ in range(count)]
            yield check_hm_properties, inputs
|
||||
|
@@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
|
||||
return itertools.chain(*get_frequency_list(lang, wordlist))
|
||||
|
||||
|
||||
def half_harmonic_mean(a, b):
    """
    An associative, commutative, monotonic function that returns a value
    less than or equal to both a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    The result is `(a * b) / (a + b)`, which is the same as saying that
    `1 / result == 1 / a + 1 / b`. If either input is zero the result is
    0.0; this includes the case where both are zero, for which the formula
    itself would divide by zero.

    NOTE(review): the properties above presumably assume non-negative
    inputs (frequencies); inputs with `a == -b != 0` still raise
    ZeroDivisionError.
    """
    if a == 0 or b == 0:
        # The formula already yields 0 when exactly one input is 0; doing
        # the same for two zeros avoids a ZeroDivisionError.
        return 0.0
    return (a * b) / (a + b)
|
||||
|
||||
|
||||
# This dict and inner function are used to implement a "drop everything" cache
|
||||
# for word_frequency(); the overheads of lru_cache() are comparable to the time
|
||||
# it takes to look up frequencies from scratch, so something faster is needed.
|
||||
_wf_cache = {}
|
||||
|
||||
def _word_frequency(word, lang, wordlist, minimum):
    """
    Compute the frequency of `word` from the frequency dictionary.

    `word` is split with `tokenize(word, lang)`, and every token is looked
    up in the dictionary returned by `get_frequency_dict(lang, wordlist)`.

    Returns `minimum` when there are no tokens or when any token is missing
    from the dictionary. Otherwise returns the combined frequency of all the
    tokens, or `minimum` if that is larger.
    """
    tokens = tokenize(word, lang)
    if not tokens:
        return minimum

    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        one_over_result += 1.0 / freqs[token]

    return max(1.0 / one_over_result, minimum)
|
||||
|
||||
def word_frequency(word, lang, wordlist='combined', minimum=0.):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user