builder: Use an optional cutoff when merging counts

This allows the Reddit-merging step to not use such a ludicrous amount
of memory.
Rob Speer 2015-12-15 14:44:34 -05:00
parent 9a5d9d66bb
commit 973caca253
3 changed files with 15 additions and 7 deletions
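The memory win described in the commit message comes from filtering at read time: with a nonzero cutoff, rare tokens are dropped while each counts file is streamed, so they never enter the dictionaries that later get merged. The snippet below is a hedged sketch of that idea, not the actual wordfreq_builder.word_counts.read_values; the helper names and the word,count CSV layout are assumptions.

from collections import defaultdict


def read_counts_with_cutoff(filename, cutoff=0):
    # Hypothetical reader: entries below the cutoff are skipped as the file
    # is streamed, so they never take up memory in the returned dict.
    counts = {}
    total = 0
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            word, _, count_str = line.rstrip('\n').rpartition(',')
            count = int(count_str)
            total += count
            if count >= cutoff:
                counts[word] = count
    return counts, total


def merge_filtered_counts(filenames, cutoff=0):
    # Merge the pre-filtered dictionaries by summing counts per word.
    merged = defaultdict(int)
    for filename in filenames:
        counts, _ = read_counts_with_cutoff(filename, cutoff=cutoff)
        for word, count in counts.items():
            merged[word] += count
    return dict(merged)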

View File

@@ -92,7 +92,7 @@ rule merge
   command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $in $out

View File

@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0, max_size=1000000)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -14,7 +14,8 @@ def merge_lists(input_names, output_name):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
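With the new -c option, each input list is pre-filtered before merging, and the default of 0 leaves existing callers unaffected. A hypothetical invocation (the file names are made up for illustration):

    python -m wordfreq_builder.cli.merge_counts -o reddit-counts.csv -c 3 part-00.counts.csv part-01.counts.csv

which is the same as calling merge_lists(['part-00.counts.csv', 'part-01.counts.csv'], 'reddit-counts.csv', cutoff=3) directly.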

View File

@@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages):
         processed_files.append(count_file)
 
     output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
     return lines
@@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
     )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )
     return lines
@@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
             params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )
         add_dep(
-            lines, 'merge_counts', processed_file, output_file
+            lines, 'merge_counts', processed_file, output_file,
+            params={'cutoff': 0}
         )
     return lines
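In the build-graph changes above, the cutoff is chosen per source: Reddit, where memory was the problem, gets a cutoff of 3, while the two SUBTLEX targets pass a cutoff of 0 so nothing is discarded. Presumably add_dep turns the params dict into per-build ninja variable bindings, which is what the merge_counts rule's $cutoff expands to. The helper below is a hedged guess at that mechanism, not wordfreq_builder's actual add_dep:

def add_dep_sketch(lines, rule, inputs, output, params=None):
    # Hypothetical: emit a ninja build statement, then indented variable
    # bindings, so that {'cutoff': 3} becomes "  cutoff = 3", which the
    # merge_counts rule picks up through $cutoff.
    if isinstance(inputs, list):
        inputs = ' '.join(inputs)
    lines.append('build {}: {} {}'.format(output, rule, inputs))
    for key, value in (params or {}).items():
        lines.append('  {} = {}'.format(key, value))


lines = []
add_dep_sketch(lines, 'merge_counts', ['part-00.counts.csv', 'part-01.counts.csv'],
               'reddit-counts.txt', params={'cutoff': 3})
# lines now holds:
#   'build reddit-counts.txt: merge_counts part-00.counts.csv part-01.counts.csv'
#   '  cutoff = 3'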