From 7d1719cfb4a2c45c445b0c0342e4270ceb1d242d Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Tue, 15 Dec 2015 14:44:34 -0500
Subject: [PATCH] builder: Use an optional cutoff when merging counts

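This allows the Reddit-merging step to skip words whose count is below a
cutoff (3) as it reads each input file, so it no longer uses such a
ludicrous amount of memory. The SUBTLEX merges pass a cutoff of 0, which
matches the value that was previously hard-coded.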


Former-commit-id: 973caca2532197b87f3117ab15c7828565d2bec8
---
 wordfreq_builder/rules.ninja                        |  2 +-
 .../wordfreq_builder/cli/merge_counts.py            |  7 ++++---
 wordfreq_builder/wordfreq_builder/ninja.py          | 13 ++++++++++---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index f039705..ac9d4a0 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -92,7 +92,7 @@ rule merge
   command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 6e5bff9..c44f0cf 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse
 
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0, max_size=1000000)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -14,7 +14,8 @@ def merge_lists(input_names, output_name):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
 
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index ef8c368..dc2a058 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages):
         processed_files.append(count_file)
 
     output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
     return lines
 
 
@@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
         )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )
 
     return lines
 
@@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
             params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )
         add_dep(
-            lines, 'merge_counts', processed_file, output_file
+            lines, 'merge_counts', processed_file, output_file,
+            params={'cutoff': 0}
         )
     return lines