builder: Use an optional cutoff when merging counts

This allows the Reddit-merging step to not use such a ludicrous amount of memory.
2024-12-23 09:21:37 +00:00 · 2015-12-15 14:44:34 -05:00 · 2015-12-15 14:44:34 -05:00 · 973caca253
commit 973caca253
parent 9a5d9d66bb
3 changed files with 15 additions and 7 deletions
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -92,7 +92,7 @@ rule merge
  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in

 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in

 rule freqs2cB
  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse


-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
    count_dicts = []
    for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0, max_size=1000000)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
        count_dicts.append(values)
    merged = merge_counts(count_dicts)
    write_wordlist(merged, output_name)
@@ -14,7 +14,8 @@ def merge_lists(input_names, output_name):
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)

--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages):
        processed_files.append(count_file)

    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
    return lines


@@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
        )

    output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )

    return lines

@@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
        )
        add_dep(
-            lines, 'merge_counts', processed_file, output_file
+            lines, 'merge_counts', processed_file, output_file,
+            params={'cutoff': 0}
        )
    return lines