mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
parent
a9a4483ca3
commit
a2bc90e430
@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
||||
import argparse
|
||||
|
||||
|
||||
def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
|
||||
def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
|
||||
count_dicts = []
|
||||
for input_name in input_names:
|
||||
values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
|
||||
values, total = read_values(input_name, cutoff=cutoff, max_words=max_words)
|
||||
count_dicts.append(values)
|
||||
merged = merge_counts(count_dicts)
|
||||
write_wordlist(merged, output_name)
|
||||
@ -22,5 +22,4 @@ if __name__ == '__main__':
|
||||
parser.add_argument('inputs', nargs='+',
|
||||
help='names of input files to merge')
|
||||
args = parser.parse_args()
|
||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
|
||||
|
||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words)
|
||||
|
@ -36,15 +36,15 @@ def count_tokens(filename):
|
||||
return counts
|
||||
|
||||
|
||||
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
||||
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
|
||||
"""
|
||||
Read words and their frequency or count values from a CSV file. Returns
|
||||
a dictionary of values and the total of all values.
|
||||
|
||||
Only words with a value greater than or equal to `cutoff` are returned.
|
||||
In addition, only up to `max_size` words are read.
|
||||
In addition, only up to `max_words` words are read.
|
||||
|
||||
If `cutoff` is greater than 0 or `max_size` is smaller than the list,
|
||||
If `cutoff` is greater than 0 or `max_words` is smaller than the list,
|
||||
the csv file must be sorted by value in descending order, so that the
|
||||
most frequent words are kept.
|
||||
|
||||
@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
||||
for key, strval in csv.reader(infile):
|
||||
val = float(strval)
|
||||
key = fix_text(key)
|
||||
if val < cutoff or len(values) >= max_size:
|
||||
if val < cutoff or len(values) >= max_words:
|
||||
break
|
||||
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||
for token in tokens:
|
||||
|
Loading…
Reference in New Issue
Block a user