From f305679caf818d3d2b621759be88c3d360487d29 Mon Sep 17 00:00:00 2001
From: Rob Speer <rob@luminoso.com>
Date: Mon, 22 Jun 2015 17:38:13 -0400
Subject: [PATCH] Switch to a centibel scale, add a header to the data

---
 wordfreq_builder/rules.ninja                  |  4 +--
 .../cli/{freqs_to_dB.py => freqs_to_cB.py}    |  4 +--
 wordfreq_builder/wordfreq_builder/ninja.py    |  6 ++--
 .../wordfreq_builder/word_counts.py           | 31 ++++++++++---------
 4 files changed, 24 insertions(+), 21 deletions(-)
 rename wordfreq_builder/wordfreq_builder/cli/{freqs_to_dB.py => freqs_to_cB.py} (70%)

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index d8017e8..0c72cfe 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -87,8 +87,8 @@ rule count
 rule merge
   command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
 
-rule freqs2dB
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out
+rule freqs2cB
+  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
 
 rule cat
   command = cat $in > $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
similarity index 70%
rename from wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py
rename to wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
index 81a4dde..6bf3957 100644
--- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py
+++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
@@ -1,4 +1,4 @@
-from wordfreq_builder.word_counts import freqs_to_dBpack
+from wordfreq_builder.word_counts import freqs_to_cBpack
 import argparse
 
 
@@ -7,5 +7,5 @@ if __name__ == '__main__':
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_dBpack(args.filename_in, args.filename_out)
+    freqs_to_cBpack(args.filename_in, args.filename_out)
 
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index e691bf4..04d3df3 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -205,11 +205,11 @@ def combine_lists(languages):
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py')
 
-        output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
-        add_dep(lines, 'freqs2dB', output_file, output_dBpack,
+        output_cBpack = wordlist_filename('combined', language, 'msgpack.gz')
+        add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py')
 
-        lines.append('default {}'.format(output_dBpack))
+        lines.append('default {}'.format(output_cBpack))
     return lines
 
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 745e355..cc4c3a5 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -50,30 +50,33 @@ def read_freqs(filename, cutoff=0):
     return freqs
 
 
-def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
     """
     Convert a csv file of words and their frequencies to a file in the
-    idiosyncratic 'dBpack' format.
+    idiosyncratic 'cBpack' format.
 
-    Only words with a frequency greater than `cutoff` dB will be written to
-    the new file.
+    Only words with a frequency of at least `cutoff` centibels will be
+    written to the new file.
     """
-    freq_cutoff = 10 ** (cutoff / 10.)
+    freq_cutoff = 10 ** (cutoff / 100.)
     freqs = read_freqs(in_filename, freq_cutoff)
-    dBpack = []
+    cBpack = []
     for token, freq in freqs.items():
-        dB = round(math.log10(freq) * 10)
-        if dB >= cutoff:
-            neg_dB = -dB
-            while neg_dB >= len(dBpack):
-                dBpack.append([])
-            dBpack[neg_dB].append(token)
+        cB = round(math.log10(freq) * 100)
+        if cB >= cutoff:
+            neg_cB = -cB
+            while neg_cB >= len(cBpack):
+                cBpack.append([])
+            cBpack[neg_cB].append(token)
 
-    for sublist in dBpack:
+    for sublist in cBpack:
         sublist.sort()
 
+    # Write a "header" consisting of a dictionary at the start of the file
+    cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack
+
     with gzip.open(out_filename, 'wb') as outfile:
-        msgpack.dump(dBpack, outfile)
+        msgpack.dump(cBpack_data, outfile)
 
 
 def merge_freqs(freq_dicts):