Switch to a faster 'drop everything' style cache for word_frequency().

2024-12-24 01:41:39 +00:00 · 2015-07-09 14:28:55 -04:00 · 2015-07-09 14:28:55 -04:00 · 2821f23e79
commit 2821f23e79
parent b4bae84aac
1 changed file with 34 additions and 20 deletions
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -220,7 +220,32 @@ def half_harmonic_mean(a, b):
    return (a * b) / (a + b)
-@lru_cache(maxsize=CACHE_SIZE)
+# This dict and inner function are used to implement a "drop everything" cache
 # for word_frequency(); the overheads of lru_cache() are comparable to the time
 # it takes to look up frequencies from scratch, so something faster is needed.
 # Maps (word, lang, wordlist, minimum) argument tuples to the value computed
 # by _word_frequency(). word_frequency() empties the whole dict once it
 # holds CACHE_SIZE entries, then starts filling it again.
 _wf_cache = {}
 def _word_frequency(word, lang, wordlist, minimum):
    """
    Compute the frequency of `word` from scratch, without using the cache.

    word_frequency() calls this function on a cache miss and stores the
    result in `_wf_cache`.

    `word` is split into tokens with `tokenize(word, lang)`, and each token
    is looked up in the dictionary returned by
    `get_frequency_dict(lang, wordlist)`. The per-token values are folded
    together with `half_harmonic_mean`.

    Returns `minimum` if `word` produces no tokens or if any token is absent
    from the frequency dictionary; otherwise returns the combined value, or
    `minimum` if that is larger.
    """
    freqs = get_frequency_dict(lang, wordlist)
    # None marks "no token folded in yet", so the first token's value is
    # taken as-is instead of being combined with a placeholder.
    combined_value = None
    tokens = tokenize(word, lang)
    # Nothing to look up: fall back to the caller-supplied floor.
    if len(tokens) == 0:
        return minimum
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        value = freqs[token]
        if combined_value is None:
            combined_value = value
        else:
            # Combine word values using the half-harmonic-mean formula,
            # (a * b) / (a + b). This operation is associative.
            combined_value = half_harmonic_mean(combined_value, value)
    # Never report less than the requested minimum.
    return max(combined_value, minimum)
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
@@ -246,25 +271,14 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
    of the word frequency that is no greater than the frequency of any of its
    individual tokens.
    """
-    freqs = get_frequency_dict(lang, wordlist)
+    args = (word, lang, wordlist, minimum)
-    combined_value = None
+    try:
-    tokens = tokenize(word, lang)
+        return _wf_cache[args]
-
+    except KeyError:
-    if len(tokens) == 0:
+        if len(_wf_cache) >= CACHE_SIZE:
-        return minimum
+            _wf_cache.clear()
-
+        _wf_cache[args] = _word_frequency(*args)
-    for token in tokens:
+        return _wf_cache[args]
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
        value = freqs[token]
        if combined_value is None:
            combined_value = value
        else:
            # Combine word values using the half-harmonic-mean formula,
            # (a * b) / (a + b). This operation is associative.
            combined_value = half_harmonic_mean(combined_value, value)
    return max(combined_value, minimum)
@lru_cache(maxsize=100)