diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py index 2e740cf..6413024 100644 --- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py +++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py @@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli import argparse -def merge_lists(input_names, output_name, cutoff=0, max_size=1000000): +def merge_lists(input_names, output_name, cutoff=0, max_words=1000000): count_dicts = [] for input_name in input_names: - values, total = read_values(input_name, cutoff=cutoff, max_size=max_size) + values, total = read_values(input_name, cutoff=cutoff, max_words=max_words) count_dicts.append(values) merged = merge_counts(count_dicts) write_wordlist(merged, output_name) @@ -22,5 +22,4 @@ if __name__ == '__main__': parser.add_argument('inputs', nargs='+', help='names of input files to merge') args = parser.parse_args() - merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words) - + merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index ded334a..65baf72 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -36,15 +36,15 @@ def count_tokens(filename): return counts -def read_values(filename, cutoff=0, max_size=1e8, lang=None): +def read_values(filename, cutoff=0, max_words=1e8, lang=None): """ Read words and their frequency or count values from a CSV file. Returns a dictionary of values and the total of all values. Only words with a value greater than or equal to `cutoff` are returned. - In addition, only up to `max_size` words are read. + In addition, only up to `max_words` words are read. - If `cutoff` is greater than 0 or `max_size` is smaller than the list, + If `cutoff` is greater than 0 or `max_words` is smaller than the list, the csv file must be sorted by value in descending order, so that the most frequent words are kept. @@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None): for key, strval in csv.reader(infile): val = float(strval) key = fix_text(key) - if val < cutoff or len(values) >= max_size: + if val < cutoff or len(values) >= max_words: break tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) for token in tokens: