From a2bc90e430c7b153f760ddb85e7de32b49d58921 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 31 Mar 2016 12:55:18 -0400 Subject: [PATCH] rename max_size to max_words consistently Former-commit-id: 16059d3b9a3ef8b1c332d75df6137bcd09fe83a2 --- wordfreq_builder/wordfreq_builder/cli/merge_counts.py | 7 +++---- wordfreq_builder/wordfreq_builder/word_counts.py | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py index 2e740cf..6413024 100644 --- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py +++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py @@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli import argparse -def merge_lists(input_names, output_name, cutoff=0, max_size=1000000): +def merge_lists(input_names, output_name, cutoff=0, max_words=1000000): count_dicts = [] for input_name in input_names: - values, total = read_values(input_name, cutoff=cutoff, max_size=max_size) + values, total = read_values(input_name, cutoff=cutoff, max_words=max_words) count_dicts.append(values) merged = merge_counts(count_dicts) write_wordlist(merged, output_name) @@ -22,5 +22,4 @@ if __name__ == '__main__': parser.add_argument('inputs', nargs='+', help='names of input files to merge') args = parser.parse_args() - merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words) - + merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index ded334a..65baf72 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -36,15 +36,15 @@ def count_tokens(filename): return counts -def read_values(filename, cutoff=0, max_size=1e8, lang=None): +def read_values(filename, cutoff=0, max_words=1e8, lang=None): """ Read words and their frequency or count values from a CSV file. Returns a dictionary of values and the total of all values. Only words with a value greater than or equal to `cutoff` are returned. - In addition, only up to `max_size` words are read. + In addition, only up to `max_words` words are read. - If `cutoff` is greater than 0 or `max_size` is smaller than the list, + If `cutoff` is greater than 0 or `max_words` is smaller than the list, the csv file must be sorted by value in descending order, so that the most frequent words are kept. @@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None): for key, strval in csv.reader(infile): val = float(strval) key = fix_text(key) - if val < cutoff or len(values) >= max_size: + if val < cutoff or len(values) >= max_words: break tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) for token in tokens: