mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
a9a4483ca3
commit
a2bc90e430
@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
|
def merge_lists(input_names, output_name, cutoff=0, max_words=1000000):
|
||||||
count_dicts = []
|
count_dicts = []
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
values, total = read_values(input_name, cutoff=cutoff, max_size=max_size)
|
values, total = read_values(input_name, cutoff=cutoff, max_words=max_words)
|
||||||
count_dicts.append(values)
|
count_dicts.append(values)
|
||||||
merged = merge_counts(count_dicts)
|
merged = merge_counts(count_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
@ -22,5 +22,4 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('inputs', nargs='+',
|
parser.add_argument('inputs', nargs='+',
|
||||||
help='names of input files to merge')
|
help='names of input files to merge')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
|
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words)
|
||||||
|
|
||||||
|
@ -36,15 +36,15 @@ def count_tokens(filename):
|
|||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
|
||||||
"""
|
"""
|
||||||
Read words and their frequency or count values from a CSV file. Returns
|
Read words and their frequency or count values from a CSV file. Returns
|
||||||
a dictionary of values and the total of all values.
|
a dictionary of values and the total of all values.
|
||||||
|
|
||||||
Only words with a value greater than or equal to `cutoff` are returned.
|
Only words with a value greater than or equal to `cutoff` are returned.
|
||||||
In addition, only up to `max_size` words are read.
|
In addition, only up to `max_words` words are read.
|
||||||
|
|
||||||
If `cutoff` is greater than 0 or `max_size` is smaller than the list,
|
If `cutoff` is greater than 0 or `max_words` is smaller than the list,
|
||||||
the csv file must be sorted by value in descending order, so that the
|
the csv file must be sorted by value in descending order, so that the
|
||||||
most frequent words are kept.
|
most frequent words are kept.
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
|||||||
for key, strval in csv.reader(infile):
|
for key, strval in csv.reader(infile):
|
||||||
val = float(strval)
|
val = float(strval)
|
||||||
key = fix_text(key)
|
key = fix_text(key)
|
||||||
if val < cutoff or len(values) >= max_size:
|
if val < cutoff or len(values) >= max_words:
|
||||||
break
|
break
|
||||||
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
Loading…
Reference in New Issue
Block a user