rename max_size to max_words consistently

Rob Speer 2016-03-31 12:55:18 -04:00
parent 697842b3f9
commit 16059d3b9a
2 changed files with 7 additions and 8 deletions

View File

@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
+def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
+        values, total = read_values(input_name, cutoff=cutoff, max_words=max_words)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -22,5 +22,4 @@ if __name__ == '__main__':
     parser.add_argument('inputs', nargs='+',
                         help='names of input files to merge')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, cutoff=args.cutoff,
-                max_size=args.max_words)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words)
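For context, the renamed keyword travels from the CLI flag down into read_values. A minimal usage sketch of the call after this change follows; the module path and CSV file names are illustrative assumptions, not taken from the commit:

    # Hypothetical usage of merge_lists() after the rename. The import path
    # and file names below are assumptions for illustration only.
    from wordfreq_builder.cli.merge_counts import merge_lists

    merge_lists(
        ['counts_web.csv', 'counts_twitter.csv'],  # inputs, sorted by value, descending
        'counts_merged.csv',                       # merged output wordlist
        cutoff=2,                                  # skip words with a value below 2
        max_words=500000,                          # read at most 500000 words per input
    )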

View File

@@ -36,15 +36,15 @@ def count_tokens(filename):
     return counts
 
 
-def read_values(filename, cutoff=0, max_size=1e8, lang=None):
+def read_values(filename, cutoff=0, max_words=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file. Returns
     a dictionary of values and the total of all values.
 
     Only words with a value greater than or equal to `cutoff` are returned.
-    In addition, only up to `max_size` words are read.
+    In addition, only up to `max_words` words are read.
 
-    If `cutoff` is greater than 0 or `max_size` is smaller than the list,
+    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
     the csv file must be sorted by value in descending order, so that the
     most frequent words are kept.
 
@@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff or len(values) >= max_size:
+            if val < cutoff or len(values) >= max_words:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
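To make the docstring's contract concrete, here is a minimal, self-contained sketch of the early-exit logic that the `max_words` check implements. read_values_sketch is a hypothetical stand-in, not the library function: the real read_values also applies fix_text and tokenization, which are omitted here.

    # Minimal sketch, assuming the CSV is "word,value" rows with no header,
    # sorted by value in descending order as the docstring requires.
    import csv

    def read_values_sketch(filename, cutoff=0, max_words=1e8):
        values = {}
        total = 0.0
        with open(filename, encoding='utf-8', newline='') as infile:
            for key, strval in csv.reader(infile):
                val = float(strval)
                # Because the file is sorted in descending order, the first
                # value below `cutoff` means every remaining value is below it
                # too, so breaking here keeps exactly the most frequent words.
                if val < cutoff or len(values) >= max_words:
                    break
                values[key] = values.get(key, 0.0) + val
                total += val
        return values, total

This also shows why the sorted-order requirement only matters when `cutoff` is above 0 or `max_words` is smaller than the list: otherwise the loop reads every row and the break never fires.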