Merge pull request #9 from LuminosoInsight/centibels

Switch to a more precise centibel scale
Joshua Chin 2015-06-23 12:55:55 -04:00
commit 0cac6dfda1
17 changed files with 54 additions and 34 deletions
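For context on why centibels are "more precise": rounding to whole decibels lumps together frequencies that differ by up to a factor of 10 ** 0.1 (about 1.26), while rounding to whole centibels only lumps together factors of 10 ** 0.01 (about 1.023). A quick check, in plain Python with nothing wordfreq-specific:

# Ratio between adjacent buckets on each scale.
print(10 ** 0.1)   # ~1.259: adjacent decibel buckets
print(10 ** 0.01)  # ~1.023: adjacent centibel buckets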

View File

@@ -1,5 +1,5 @@
 from wordfreq import (
-    word_frequency, available_languages, dB_to_freq, iter_wordlist,
+    word_frequency, available_languages, cB_to_freq, iter_wordlist,
     top_n_list, random_words, random_ascii_words
 )
 from nose.tools import (
@@ -48,7 +48,7 @@ def test_most_common_words():
         return top_n_list(lang, 1)[0]
 
     eq_(get_most_common('ar'), 'في')
-    eq_(get_most_common('de'), 'der')
+    eq_(get_most_common('de'), 'die')
     eq_(get_most_common('en'), 'the')
     eq_(get_most_common('es'), 'de')
     eq_(get_most_common('fr'), 'de')
@@ -70,15 +70,15 @@ def test_language_matching():
     eq_(word_frequency('', 'cmn'), freq)
 
 
-def test_dB_conversion():
-    eq_(dB_to_freq(0), 1.)
-    assert_almost_equal(dB_to_freq(-10), 0.1)
-    assert_almost_equal(dB_to_freq(-60), 1e-6)
+def test_cB_conversion():
+    eq_(cB_to_freq(0), 1.)
+    assert_almost_equal(cB_to_freq(-100), 0.1)
+    assert_almost_equal(cB_to_freq(-600), 1e-6)
 
 
 @raises(ValueError)
-def test_failed_dB_conversion():
-    dB_to_freq(1)
+def test_failed_cB_conversion():
+    cB_to_freq(1)
 
 
 def test_tokenization():
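For reference, a minimal standalone sketch of the scale these updated tests exercise: a frequency maps to 100 * log10(freq) centibels, and back via 10 ** (cB / 100), which is why -100 cB is 1 in 10 and -600 cB is 1 in a million. The freq_to_cB helper below is hypothetical (wordfreq only needs the cB-to-frequency direction) and is shown only to illustrate the relationship.

import math

def cB_to_freq(cB):
    # Same rule as the converted function in wordfreq: x cB -> 10 ** (x / 100).
    if cB > 0:
        raise ValueError("A frequency cannot be a positive number of centibels.")
    return 10 ** (cB / 100)

def freq_to_cB(freq):
    # Hypothetical inverse, for illustration only: 100 * log10(frequency).
    if not 0 < freq <= 1:
        raise ValueError("A proportion must be in (0, 1].")
    return 100 * math.log10(freq)

assert cB_to_freq(0) == 1.0
assert abs(cB_to_freq(-100) - 0.1) < 1e-9
assert abs(cB_to_freq(-600) - 1e-6) < 1e-12
assert round(freq_to_cB(0.01)) == -200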

View File

@@ -53,36 +53,56 @@ def tokenize(text, lang):
     return simple_tokenize(text)
 
 
-def read_dBpack(filename):
+def read_cBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
-    approximate word frequencies, called "dBpack".
+    approximate word frequencies, called "cBpack".
 
-    The dBpack format is as follows:
+    The cBpack format is as follows:
 
     - The file on disk is a gzipped file in msgpack format, which decodes to a
-      list of lists of words.
+      list whose first element is a header, and whose remaining elements are
+      lists of words.
+
+    - The header is a dictionary with 'format' and 'version' keys that make
+      sure that we're reading the right thing.
 
     - Each inner list of words corresponds to a particular word frequency,
-      rounded to the nearest decibel. 0 dB represents a word that occurs with
-      probability 1, so it is the only word in the data (this of course doesn't
-      happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB
-      represents a word that occurs once per 1000 tokens, and so on.
+      rounded to the nearest centibel -- that is, one tenth of a decibel, or
+      a factor of 10 ** .01.
 
-    - The index of each list within the overall list is the negative of its
-      frequency in decibels.
+      0 cB represents a word that occurs with probability 1, so it is the only
+      word in the data (this of course doesn't happen). -200 cB represents a
+      word that occurs once per 100 tokens, -300 cB represents a word that
+      occurs once per 1000 tokens, and so on.
+
+    - The index of each list within the overall list (without the header) is
+      the negative of its frequency in centibels.
 
     - Each inner list is sorted in alphabetical order.
 
     As an example, consider a corpus consisting only of the words "red fish
-    blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red"
-    and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word
-    frequencies would decode to this list:
+    blue fish". The word "fish" occurs as 50% of tokens (-30 cB), while "red"
+    and "blue" occur as 25% of tokens (-60 cB). The cBpack file of their word
+    frequencies would decode to this:
 
-    [[], [], [], ['fish'], [], [], ['blue', 'red']]
+    [
+        {'format': 'cB', 'version': 1},
+        [], [], [], ... # 30 empty lists
+        ['fish'],
+        [], [], [], ... # 29 more empty lists
+        ['blue', 'red']
+    ]
     """
     with gzip.open(filename, 'rb') as infile:
-        return msgpack.load(infile, encoding='utf-8')
+        data = msgpack.load(infile, encoding='utf-8')
+        header = data[0]
+        if (
+            not isinstance(header, dict) or header.get('format') != 'cB'
+            or header.get('version') != 1
+        ):
+            raise ValueError("Unexpected header: %r" % header)
+        return data[1:]
 
 
 def available_languages(wordlist='combined'):
@@ -103,7 +123,7 @@ def available_languages(wordlist='combined'):
 def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
     """
     Read the raw data from a wordlist file, returning it as a list of
-    lists. (See `read_dBpack` for what this represents.)
+    lists. (See `read_cBpack` for what this represents.)
 
     Because we use the `langcodes` module, we can handle slight
     variations in language codes. For example, looking for 'pt-BR',
@@ -123,25 +143,25 @@ def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
             % (lang, best, langcodes.get(best).language_name('en'))
         )
 
-    return read_dBpack(available[best])
+    return read_cBpack(available[best])
 
 
-def dB_to_freq(dB):
+def cB_to_freq(cB):
     """
-    Convert a word frequency from the logarithmic decibel scale that we use
+    Convert a word frequency from the logarithmic centibel scale that we use
     internally, to a proportion from 0 to 1.
 
-    On this scale, 0 dB represents the maximum possible frequency of
-    1.0. -10 dB represents a word that happens 1 in 10 times,
-    -20 dB represents something that happens 1 in 100 times, and so on.
+    On this scale, 0 cB represents the maximum possible frequency of
+    1.0. -100 cB represents a word that happens 1 in 10 times,
+    -200 cB represents something that happens 1 in 100 times, and so on.
 
-    In general, x dB represents a frequency of 10 ** (x/10).
+    In general, x cB represents a frequency of 10 ** (x/100).
     """
-    if dB > 0:
+    if cB > 0:
         raise ValueError(
             "A frequency cannot be a positive number of decibels."
         )
-    return 10 ** (dB / 10)
+    return 10 ** (cB / 100)
 
 
 @lru_cache(maxsize=None)
@@ -154,7 +174,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
     pack = get_frequency_list(lang, wordlist, match_cutoff)
     for index, bucket in enumerate(pack):
         for word in bucket:
-            freqs[word] = dB_to_freq(-index)
+            freqs[word] = cB_to_freq(-index)
     return freqs
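To make the cBpack layout concrete, here is a minimal sketch of a writer that produces a file read_cBpack (above) would accept. It is not part of this commit and is not wordfreq's actual build code; write_cBpack and the output filename are made up for the example. It simply prepends the header that read_cBpack validates, then gzips the msgpack serialization.

import gzip
import msgpack

def write_cBpack(filename, freq_lists):
    # Hypothetical writer for illustration: header first, then one list of
    # words per centibel bucket, serialized with msgpack and gzipped.
    data = [{'format': 'cB', 'version': 1}] + freq_lists
    with gzip.open(filename, 'wb') as outfile:
        outfile.write(msgpack.packb(data))

# The "red fish blue fish" example from the docstring: "fish" at -30 cB and
# "blue"/"red" at -60 cB land at indexes 30 and 60 of the header-less list.
buckets = [[] for _ in range(61)]
buckets[30] = ['fish']
buckets[60] = ['blue', 'red']
write_cBpack('red_fish_blue_fish.msgpack.gz', buckets)

Reading the file back with read_cBpack returns the 61 buckets without the header, and get_frequency_dict then maps index 30 to cB_to_freq(-30), about 0.50, and index 60 to about 0.25, matching the example.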

15 binary files changed (not shown)