mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
make max-words a real, documented parameter
Former-commit-id: 178a8b1494
This commit is contained in:
parent
23c5c4adca
commit
670ab12f54
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name, cutoff=0):
|
def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
|
||||||
count_dicts = []
|
count_dicts = []
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
|
values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
|
||||||
count_dicts.append(values)
|
count_dicts.append(values)
|
||||||
merged = merge_counts(count_dicts)
|
merged = merge_counts(count_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
@@ -15,7 +15,8 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
||||||
parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
|
parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
|
||||||
|
parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff)
|
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
|
||||||
|
|
||||||
|
@@ -42,9 +42,11 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
|||||||
a dictionary of values and the total of all values.
|
a dictionary of values and the total of all values.
|
||||||
|
|
||||||
Only words with a value greater than or equal to `cutoff` are returned.
|
Only words with a value greater than or equal to `cutoff` are returned.
|
||||||
|
In addition, only up to `max_size` words are read.
|
||||||
|
|
||||||
If `cutoff` is greater than 0, the csv file must be sorted by value
|
If `cutoff` is greater than 0 or `max_size` is smaller than the list,
|
||||||
in descending order.
|
the csv file must be sorted by value in descending order, so that the
|
||||||
|
most frequent words are kept.
|
||||||
|
|
||||||
If `lang` is given, it will apply language-specific tokenization to the
|
If `lang` is given, it will apply language-specific tokenization to the
|
||||||
words that it reads.
|
words that it reads.
|
||||||
|
Loading…
Reference in New Issue
Block a user