rename max_size to max_words consistently

Rob Speer 2016-03-31 12:55:18 -04:00
parent 697842b3f9
commit 16059d3b9a
2 changed files with 7 additions and 8 deletions

View File

@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
+def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
+        values, total = read_values(input_name, cutoff=cutoff, max_words=max_words)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -22,5 +22,4 @@ if __name__ == '__main__':
     parser.add_argument('inputs', nargs='+',
                         help='names of input files to merge')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, cutoff=args.cutoff,
-                max_size=args.max_words)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words)
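For context, the renamed keyword travels from the CLI flag down into read_values. A minimal usage sketch of the call after this change follows; the module path and CSV file names are illustrative assumptions, not taken from the commit:

    # Hypothetical usage of merge_lists() after the rename. The import path
    # and file names below are assumptions for illustration only.
    from wordfreq_builder.cli.merge_counts import merge_lists

    merge_lists(
        ['counts_web.csv', 'counts_twitter.csv'],  # inputs, sorted by value, descending
        'counts_merged.csv',                       # merged output wordlist
        cutoff=2,                                  # skip words with a value below 2
        max_words=500000,                          # read at most 500000 words per input
    )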

View File

@@ -36,15 +36,15 @@ def count_tokens(filename):
     return counts
 
 
-def read_values(filename, cutoff=0, max_size=1e8, lang=None):
+def read_values(filename, cutoff=0, max_words=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file. Returns
     a dictionary of values and the total of all values.
 
     Only words with a value greater than or equal to `cutoff` are returned.
-    In addition, only up to `max_size` words are read.
+    In addition, only up to `max_words` words are read.
 
-    If `cutoff` is greater than 0 or `max_size` is smaller than the list,
+    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
     the csv file must be sorted by value in descending order, so that the
     most frequent words are kept.
 
@@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff or len(values) >= max_size:
+            if val < cutoff or len(values) >= max_words:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
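To make the docstring's contract concrete, here is a minimal, self-contained sketch of the early-exit logic that the `max_words` check implements. read_values_sketch is a hypothetical stand-in, not the library function: the real read_values also applies fix_text and tokenization, which are omitted here.

    # Minimal sketch, assuming the CSV is "word,value" rows with no header,
    # sorted by value in descending order as the docstring requires.
    import csv

    def read_values_sketch(filename, cutoff=0, max_words=1e8):
        values = {}
        total = 0.0
        with open(filename, encoding='utf-8', newline='') as infile:
            for key, strval in csv.reader(infile):
                val = float(strval)
                # Because the file is sorted in descending order, the first
                # value below `cutoff` means every remaining value is below it
                # too, so breaking here keeps exactly the most frequent words.
                if val < cutoff or len(values) >= max_words:
                    break
                values[key] = values.get(key, 0.0) + val
                total += val
        return values, total

This also shows why the sorted-order requirement only matters when `cutoff` is above 0 or `max_words` is smaller than the list: otherwise the loop reads every row and the break never fires.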