From 7d1719cfb4a2c45c445b0c0342e4270ceb1d242d Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 15 Dec 2015 14:44:34 -0500 Subject: [PATCH] builder: Use an optional cutoff when merging counts This allows the Reddit-merging step to not use such a ludicrous amount of memory. Former-commit-id: 973caca2532197b87f3117ab15c7828565d2bec8 --- wordfreq_builder/rules.ninja | 2 +- .../wordfreq_builder/cli/merge_counts.py | 7 ++++--- wordfreq_builder/wordfreq_builder/ninja.py | 13 ++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index f039705..ac9d4a0 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -92,7 +92,7 @@ rule merge command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in rule merge_counts - command = python -m wordfreq_builder.cli.merge_counts -o $out $in + command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in rule freqs2cB command = python -m wordfreq_builder.cli.freqs_to_cB $in $out diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py index 6e5bff9..c44f0cf 100644 --- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py +++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py @@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli import argparse -def merge_lists(input_names, output_name): +def merge_lists(input_names, output_name, cutoff=0): count_dicts = [] for input_name in input_names: - values, total = read_values(input_name, cutoff=0, max_size=1000000) + values, total = read_values(input_name, cutoff=cutoff, max_size=1000000) count_dicts.append(values) merged = merge_counts(count_dicts) write_wordlist(merged, output_name) @@ -14,7 +14,8 @@ def merge_lists(input_names, output_name): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') + parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file') parser.add_argument('inputs', help='names of input files to merge', nargs='+') args = parser.parse_args() - merge_lists(args.inputs, args.output) + merge_lists(args.inputs, args.output, cutoff=args.cutoff) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ef8c368..dc2a058 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages): processed_files.append(count_file) output_file = wordlist_filename('reddit', 'en', 'counts.txt') - add_dep(lines, 'merge_counts', processed_files, output_file) + add_dep( + lines, 'merge_counts', processed_files, output_file, + params={'cutoff': 3} + ) return lines @@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages): ) output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt') - add_dep(lines, 'merge_counts', processed_files, output_file) + add_dep( + lines, 'merge_counts', processed_files, output_file, + params={'cutoff': 0} + ) return lines @@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages): params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} ) add_dep( - lines, 'merge_counts', processed_file, output_file + lines, 'merge_counts', processed_file, output_file, + params={'cutoff': 0} ) return lines