Switch to a faster 'drop everything' style cache for word_frequency().

This commit is contained in:
Andrew Lin 2015-07-09 14:28:55 -04:00
parent b4bae84aac
commit 2821f23e79

View File

@ -220,7 +220,32 @@ def half_harmonic_mean(a, b):
return (a * b) / (a + b) return (a * b) / (a + b)
@lru_cache(maxsize=CACHE_SIZE) # This dict and inner function are used to implement a "drop everything" cache
# for word_frequency(); the overheads of lru_cache() are comparable to the time
# it takes to look up frequencies from scratch, so something faster is needed.
_wf_cache = {}
def _word_frequency(word, lang, wordlist, minimum):
freqs = get_frequency_dict(lang, wordlist)
combined_value = None
tokens = tokenize(word, lang)
if len(tokens) == 0:
return minimum
for token in tokens:
if token not in freqs:
# If any word is missing, just return the default value
return minimum
value = freqs[token]
if combined_value is None:
combined_value = value
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = half_harmonic_mean(combined_value, value)
return max(combined_value, minimum)
def word_frequency(word, lang, wordlist='combined', minimum=0.): def word_frequency(word, lang, wordlist='combined', minimum=0.):
""" """
Get the frequency of `word` in the language with code `lang`, from the Get the frequency of `word` in the language with code `lang`, from the
@ -246,25 +271,14 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
of the word frequency that is no greater than the frequency of any of its of the word frequency that is no greater than the frequency of any of its
individual tokens. individual tokens.
""" """
freqs = get_frequency_dict(lang, wordlist) args = (word, lang, wordlist, minimum)
combined_value = None try:
tokens = tokenize(word, lang) return _wf_cache[args]
except KeyError:
if len(tokens) == 0: if len(_wf_cache) >= CACHE_SIZE:
return minimum _wf_cache.clear()
_wf_cache[args] = _word_frequency(*args)
for token in tokens: return _wf_cache[args]
if token not in freqs:
# If any word is missing, just return the default value
return minimum
value = freqs[token]
if combined_value is None:
combined_value = value
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = half_harmonic_mean(combined_value, value)
return max(combined_value, minimum)
@lru_cache(maxsize=100) @lru_cache(maxsize=100)