Mirror of https://github.com/rspeer/wordfreq.git
Synced 2024-12-24 09:51:38 +00:00
Express the combining of word frequencies in an explicitly associative and commutative way.
Former-commit-id: 32b4033d63
Parent: ce364297a2
Commit: 5c72e68b7e
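
For context, a quick sketch (an editor's illustration, not part of the commit) of why this rewrite preserves behavior: folding the old half_harmonic_mean helper over a list of frequencies computes exactly the reciprocal-sum formula the new code uses, because (a * b) / (a + b) = 1 / (1/a + 1/b). The frequency values below are made up.

    from functools import reduce

    def half_harmonic_mean(a, b):
        # the pairwise combiner this commit removes
        return (a * b) / (a + b)

    freqs = [0.01, 0.002, 0.0005]   # made-up token frequencies
    old_way = reduce(half_harmonic_mean, freqs)
    new_way = 1.0 / sum(1.0 / f for f in freqs)
    assert abs(old_way - new_way) < 1e-12   # both equal 1/2600 here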
@@ -1,7 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize,
-    half_harmonic_mean
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, raises
@@ -114,12 +113,9 @@ def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_almost_equal(
-        plant,
-        half_harmonic_mean(
-            word_frequency('plan', 'en'),
-            word_frequency('t', 'en')
-        )
-    )
+        1.0 / plant,
+        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+    )


 def test_not_really_random():

@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency, half_harmonic_mean
+from wordfreq import tokenize, word_frequency


 def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
         ohayou_freq / 2
     )
     assert_almost_equal(
-        word_frequency('おはようございます', 'ja'),
-        half_harmonic_mean(
-            half_harmonic_mean(ohayou_freq, gozai_freq),
-            masu_freq
-        )
+        1.0 / word_frequency('おはようございます', 'ja'),
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )

@@ -1,30 +0,0 @@
-from nose.tools import assert_less_equal, assert_almost_equal
-from wordfreq import half_harmonic_mean
-from functools import reduce
-import random
-
-
-def check_hm_properties(inputs):
-    # I asserted that the half-harmonic-mean formula is associative,
-    # commutative, monotonic, and less than or equal to its inputs.
-    # (Less if its inputs are strictly positive, in fact.)
-    #
-    # So let's test that what I said is true.
-    hm1 = reduce(half_harmonic_mean, inputs)
-    random.shuffle(inputs)
-    hm2 = reduce(half_harmonic_mean, inputs)
-    assert_almost_equal(hm1, hm2)
-
-    inputs[0] *= 2
-    hm3 = reduce(half_harmonic_mean, inputs)
-    assert_less_equal(hm2, hm3)
-
-
-def test_half_harmonic_mean():
-    for count in range(2, 6):
-        for rep in range(10):
-            # get some strictly positive arbitrary numbers
-            inputs = [random.expovariate(0.01)
-                      for i in range(count)]
-            yield check_hm_properties, inputs
-
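An editorial aside on why the properties asserted in the deleted test hold: writing h(a, b) = (a * b) / (a + b) = 1 / (1/a + 1/b) shows that h is ordinary addition of reciprocals, conjugated by the involution x -> 1/x. Addition is associative and commutative, so h is too, and folding h over any list gives 1 / (1/a1 + ... + 1/an) regardless of order. For positive inputs, 1/h(a, b) = 1/a + 1/b >= 1/a, so h(a, b) <= a, strictly less when b is positive and finite. This identity is what makes the property test redundant once the combination is expressed as a single reciprocal sum, as the next hunk does.
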
@@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
     return itertools.chain(*get_frequency_list(lang, wordlist))


-def half_harmonic_mean(a, b):
-    """
-    An associative, commutative, monotonic function that returns a value
-    less than or equal to both a and b.
-
-    Used for estimating the frequency of terms made of multiple tokens, given
-    the assumption that the tokens very frequently appear together.
-    """
-    return (a * b) / (a + b)
-
-
 # This dict and inner function are used to implement a "drop everything" cache
 # for word_frequency(); the overheads of lru_cache() are comparable to the time
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}

 def _word_frequency(word, lang, wordlist, minimum):
-    freqs = get_frequency_dict(lang, wordlist)
-    combined_value = None
     tokens = tokenize(word, lang)
-    if len(tokens) == 0:
+    if not tokens:
         return minimum

+    # Frequencies for multiple tokens are combined using the formula
+    # 1 / f = 1 / f1 + 1 / f2 + ...
+    # Thus the resulting frequency is less than any individual frequency, and
+    # the smallest frequency dominates the sum.
+    freqs = get_frequency_dict(lang, wordlist)
+    one_over_result = 0.0
     for token in tokens:
         if token not in freqs:
             # If any word is missing, just return the default value
             return minimum
-        value = freqs[token]
-        if combined_value is None:
-            combined_value = value
-        else:
-            # Combine word values using the half-harmonic-mean formula,
-            # (a * b) / (a + b). This operation is associative.
-            combined_value = half_harmonic_mean(combined_value, value)
-    return max(combined_value, minimum)
+        one_over_result += 1.0 / freqs[token]
+    return max(1.0 / one_over_result, minimum)


 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
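
A worked instance of the comment's formula in the hunk above (the numbers are hypothetical; real frequencies come from the wordlist data):

    # hypothetical frequencies for the two tokens of a phrase
    f_plan, f_t = 1e-4, 1e-3
    one_over_result = 1.0 / f_plan + 1.0 / f_t   # 10000.0 + 1000.0 = 11000.0
    combined = 1.0 / one_over_result             # about 9.09e-05
    assert combined < min(f_plan, f_t)           # the rarest token dominates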