rename max_size to max_words consistently

This commit is contained in:
Rob Speer 2016-03-31 12:55:18 -04:00
parent 697842b3f9
commit 16059d3b9a
2 changed files with 7 additions and 8 deletions

View File

@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
import argparse import argparse
def merge_lists(input_names, output_name, cutoff=0, max_size=1000000): def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
count_dicts = [] count_dicts = []
for input_name in input_names: for input_name in input_names:
values, total = read_values(input_name, cutoff=cutoff, max_size=max_size) values, total = read_values(input_name, cutoff=cutoff, max_words=max_words)
count_dicts.append(values) count_dicts.append(values)
merged = merge_counts(count_dicts) merged = merge_counts(count_dicts)
write_wordlist(merged, output_name) write_wordlist(merged, output_name)
@@ -22,5 +22,4 @@ if __name__ == '__main__':
parser.add_argument('inputs', nargs='+', parser.add_argument('inputs', nargs='+',
help='names of input files to merge') help='names of input files to merge')
args = parser.parse_args() args = parser.parse_args()
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words) merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words)

View File

@@ -36,15 +36,15 @@ def count_tokens(filename):
return counts return counts
def read_values(filename, cutoff=0, max_size=1e8, lang=None): def read_values(filename, cutoff=0, max_words=1e8, lang=None):
""" """
Read words and their frequency or count values from a CSV file. Returns Read words and their frequency or count values from a CSV file. Returns
a dictionary of values and the total of all values. a dictionary of values and the total of all values.
Only words with a value greater than or equal to `cutoff` are returned. Only words with a value greater than or equal to `cutoff` are returned.
In addition, only up to `max_size` words are read. In addition, only up to `max_words` words are read.
If `cutoff` is greater than 0 or `max_size` is smaller than the list, If `cutoff` is greater than 0 or `max_words` is smaller than the list,
the csv file must be sorted by value in descending order, so that the the csv file must be sorted by value in descending order, so that the
most frequent words are kept. most frequent words are kept.
@@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
for key, strval in csv.reader(infile): for key, strval in csv.reader(infile):
val = float(strval) val = float(strval)
key = fix_text(key) key = fix_text(key)
if val < cutoff or len(values) >= max_size: if val < cutoff or len(values) >= max_words:
break break
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens: for token in tokens: