From 178a8b14940911d37cb9d99e8d1e80060933addd Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Thu, 24 Mar 2016 14:10:02 -0400
Subject: [PATCH] make max-words a real, documented parameter

---
 wordfreq_builder/wordfreq_builder/cli/merge_counts.py | 7 ++++---
 wordfreq_builder/wordfreq_builder/word_counts.py      | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index c44f0cf..d3be5ce 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff=0):
+def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -15,7 +15,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
     parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
+    parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
 
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 47de7e5..ded334a 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -42,9 +42,11 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     a dictionary of values and the total of all values.
 
     Only words with a value greater than or equal to `cutoff` are returned.
+    In addition, only up to `max_size` words are read.
 
-    If `cutoff` is greater than 0, the csv file must be sorted by value
-    in descending order.
+    If `cutoff` is greater than 0 or `max_size` is smaller than the list,
+    the csv file must be sorted by value in descending order, so that the
+    most frequent words are kept.
 
     If `lang` is given, it will apply language-specific tokenization to the
     words that it reads.