Merge pull request #3 from LuminosoInsight/centibels

Switch to a centibel scale, add a header to the data
This commit is contained in:
Joshua Chin 2015-06-23 12:59:20 -04:00
commit 57579f0e56
4 changed files with 24 additions and 21 deletions

View File

@@ -87,8 +87,8 @@ rule count
rule merge rule merge
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
rule freqs2dB rule freqs2cB
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
rule cat rule cat
command = cat $in > $out command = cat $in > $out

View File

@@ -1,4 +1,4 @@
from wordfreq_builder.word_counts import freqs_to_dBpack from wordfreq_builder.word_counts import freqs_to_cBpack
import argparse import argparse
@@ -7,5 +7,5 @@ if __name__ == '__main__':
parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_in', help='name of input file containing tokens')
parser.add_argument('filename_out', help='name of output file') parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args() args = parser.parse_args()
freqs_to_dBpack(args.filename_in, args.filename_out) freqs_to_cBpack(args.filename_in, args.filename_out)

View File

@@ -205,11 +205,11 @@ def combine_lists(languages):
add_dep(lines, 'merge', input_files, output_file, add_dep(lines, 'merge', input_files, output_file,
extra='wordfreq_builder/word_counts.py') extra='wordfreq_builder/word_counts.py')
output_dBpack = wordlist_filename('combined', language, 'msgpack.gz') output_cBpack = wordlist_filename('combined', language, 'msgpack.gz')
add_dep(lines, 'freqs2dB', output_file, output_dBpack, add_dep(lines, 'freqs2cB', output_file, output_cBpack,
extra='wordfreq_builder/word_counts.py') extra='wordfreq_builder/word_counts.py')
lines.append('default {}'.format(output_dBpack)) lines.append('default {}'.format(output_cBpack))
return lines return lines

View File

@@ -50,30 +50,33 @@ def read_freqs(filename, cutoff=0):
return freqs return freqs
def freqs_to_dBpack(in_filename, out_filename, cutoff=-60): def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
""" """
Convert a csv file of words and their frequencies to a file in the Convert a csv file of words and their frequencies to a file in the
idiosyncratic 'dBpack' format. idiosyncratic 'cBpack' format.
Only words with a frequency greater than `cutoff` dB will be written to Only words with a frequency greater than `cutoff` centibels will be
the new file. written to the new file.
""" """
freq_cutoff = 10 ** (cutoff / 10.) freq_cutoff = 10 ** (cutoff / 100.)
freqs = read_freqs(in_filename, freq_cutoff) freqs = read_freqs(in_filename, freq_cutoff)
dBpack = [] cBpack = []
for token, freq in freqs.items(): for token, freq in freqs.items():
dB = round(math.log10(freq) * 10) cB = round(math.log10(freq) * 100)
if dB >= cutoff: if cB >= cutoff:
neg_dB = -dB neg_cB = -cB
while neg_dB >= len(dBpack): while neg_cB >= len(cBpack):
dBpack.append([]) cBpack.append([])
dBpack[neg_dB].append(token) cBpack[neg_cB].append(token)
for sublist in dBpack: for sublist in cBpack:
sublist.sort() sublist.sort()
# Write a "header" consisting of a dictionary at the start of the file
cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack
with gzip.open(out_filename, 'wb') as outfile: with gzip.open(out_filename, 'wb') as outfile:
msgpack.dump(dBpack, outfile) msgpack.dump(cBpack_data, outfile)
def merge_freqs(freq_dicts): def merge_freqs(freq_dicts):