From f305679caf818d3d2b621759be88c3d360487d29 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Mon, 22 Jun 2015 17:38:13 -0400 Subject: [PATCH] Switch to a centibel scale, add a header to the data --- wordfreq_builder/rules.ninja | 4 +-- .../cli/{freqs_to_dB.py => freqs_to_cB.py} | 4 +-- wordfreq_builder/wordfreq_builder/ninja.py | 6 ++-- .../wordfreq_builder/word_counts.py | 31 ++++++++++--------- 4 files changed, 24 insertions(+), 21 deletions(-) rename wordfreq_builder/wordfreq_builder/cli/{freqs_to_dB.py => freqs_to_cB.py} (70%) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index d8017e8..0c72cfe 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -87,8 +87,8 @@ rule count rule merge command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in -rule freqs2dB - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out +rule freqs2cB + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py similarity index 70% rename from wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py rename to wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py index 81a4dde..6bf3957 100644 --- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py @@ -1,4 +1,4 @@ -from wordfreq_builder.word_counts import freqs_to_dBpack +from wordfreq_builder.word_counts import freqs_to_cBpack import argparse @@ -7,5 +7,5 @@ if __name__ == '__main__': parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_out', help='name of output file') args = parser.parse_args() - freqs_to_dBpack(args.filename_in, args.filename_out) + freqs_to_cBpack(args.filename_in, args.filename_out) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index e691bf4..04d3df3 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -205,11 +205,11 @@ def combine_lists(languages): add_dep(lines, 'merge', input_files, output_file, extra='wordfreq_builder/word_counts.py') - output_dBpack = wordlist_filename('combined', language, 'msgpack.gz') - add_dep(lines, 'freqs2dB', output_file, output_dBpack, + output_cBpack = wordlist_filename('combined', language, 'msgpack.gz') + add_dep(lines, 'freqs2cB', output_file, output_cBpack, extra='wordfreq_builder/word_counts.py') - lines.append('default {}'.format(output_dBpack)) + lines.append('default {}'.format(output_cBpack)) return lines diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 745e355..cc4c3a5 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -50,30 +50,33 @@ def read_freqs(filename, cutoff=0): return freqs -def freqs_to_dBpack(in_filename, out_filename, cutoff=-60): +def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): """ Convert a csv file of words and their frequencies to a file in the - idiosyncratic 'dBpack' format. + idiosyncratic 'cBpack' format. - Only words with a frequency greater than `cutoff` dB will be written to - the new file. + Only words with a frequency greater than `cutoff` centibels will be + written to the new file. """ - freq_cutoff = 10 ** (cutoff / 10.) + freq_cutoff = 10 ** (cutoff / 100.) freqs = read_freqs(in_filename, freq_cutoff) - dBpack = [] + cBpack = [] for token, freq in freqs.items(): - dB = round(math.log10(freq) * 10) - if dB >= cutoff: - neg_dB = -dB - while neg_dB >= len(dBpack): - dBpack.append([]) - dBpack[neg_dB].append(token) + cB = round(math.log10(freq) * 100) + if cB >= cutoff: + neg_cB = -cB + while neg_cB >= len(cBpack): + cBpack.append([]) + cBpack[neg_cB].append(token) - for sublist in dBpack: + for sublist in cBpack: sublist.sort() + # Write a "header" consisting of a dictionary at the start of the file + cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack + with gzip.open(out_filename, 'wb') as outfile: - msgpack.dump(dBpack, outfile) + msgpack.dump(cBpack_data, outfile) def merge_freqs(freq_dicts):