mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
Merge pull request #3 from LuminosoInsight/centibels
Switch to a centibel scale, add a header to the data
This commit is contained in:
commit
57579f0e56
@ -87,8 +87,8 @@ rule count
|
|||||||
rule merge
|
rule merge
|
||||||
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
|
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
|
||||||
|
|
||||||
rule freqs2dB
|
rule freqs2cB
|
||||||
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out
|
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
||||||
|
|
||||||
rule cat
|
rule cat
|
||||||
command = cat $in > $out
|
command = cat $in > $out
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from wordfreq_builder.word_counts import freqs_to_dBpack
|
from wordfreq_builder.word_counts import freqs_to_cBpack
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
@ -7,5 +7,5 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('filename_in', help='name of input file containing tokens')
|
parser.add_argument('filename_in', help='name of input file containing tokens')
|
||||||
parser.add_argument('filename_out', help='name of output file')
|
parser.add_argument('filename_out', help='name of output file')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
freqs_to_dBpack(args.filename_in, args.filename_out)
|
freqs_to_cBpack(args.filename_in, args.filename_out)
|
||||||
|
|
@ -205,11 +205,11 @@ def combine_lists(languages):
|
|||||||
add_dep(lines, 'merge', input_files, output_file,
|
add_dep(lines, 'merge', input_files, output_file,
|
||||||
extra='wordfreq_builder/word_counts.py')
|
extra='wordfreq_builder/word_counts.py')
|
||||||
|
|
||||||
output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
|
output_cBpack = wordlist_filename('combined', language, 'msgpack.gz')
|
||||||
add_dep(lines, 'freqs2dB', output_file, output_dBpack,
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py')
|
extra='wordfreq_builder/word_counts.py')
|
||||||
|
|
||||||
lines.append('default {}'.format(output_dBpack))
|
lines.append('default {}'.format(output_cBpack))
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,30 +50,33 @@ def read_freqs(filename, cutoff=0):
|
|||||||
return freqs
|
return freqs
|
||||||
|
|
||||||
|
|
||||||
def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
|
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
||||||
"""
|
"""
|
||||||
Convert a csv file of words and their frequencies to a file in the
|
Convert a csv file of words and their frequencies to a file in the
|
||||||
idiosyncratic 'dBpack' format.
|
idiosyncratic 'cBpack' format.
|
||||||
|
|
||||||
Only words with a frequency greater than `cutoff` dB will be written to
|
Only words with a frequency greater than `cutoff` centibels will be
|
||||||
the new file.
|
written to the new file.
|
||||||
"""
|
"""
|
||||||
freq_cutoff = 10 ** (cutoff / 10.)
|
freq_cutoff = 10 ** (cutoff / 100.)
|
||||||
freqs = read_freqs(in_filename, freq_cutoff)
|
freqs = read_freqs(in_filename, freq_cutoff)
|
||||||
dBpack = []
|
cBpack = []
|
||||||
for token, freq in freqs.items():
|
for token, freq in freqs.items():
|
||||||
dB = round(math.log10(freq) * 10)
|
cB = round(math.log10(freq) * 100)
|
||||||
if dB >= cutoff:
|
if cB >= cutoff:
|
||||||
neg_dB = -dB
|
neg_cB = -cB
|
||||||
while neg_dB >= len(dBpack):
|
while neg_cB >= len(cBpack):
|
||||||
dBpack.append([])
|
cBpack.append([])
|
||||||
dBpack[neg_dB].append(token)
|
cBpack[neg_cB].append(token)
|
||||||
|
|
||||||
for sublist in dBpack:
|
for sublist in cBpack:
|
||||||
sublist.sort()
|
sublist.sort()
|
||||||
|
|
||||||
|
# Write a "header" consisting of a dictionary at the start of the file
|
||||||
|
cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack
|
||||||
|
|
||||||
with gzip.open(out_filename, 'wb') as outfile:
|
with gzip.open(out_filename, 'wb') as outfile:
|
||||||
msgpack.dump(dBpack, outfile)
|
msgpack.dump(cBpack_data, outfile)
|
||||||
|
|
||||||
|
|
||||||
def merge_freqs(freq_dicts):
|
def merge_freqs(freq_dicts):
|
||||||
|
Loading…
Reference in New Issue
Block a user