mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
builder: Use an optional cutoff when merging counts
This allows the Reddit-merging step to not use such a ludicrous amount
of memory.
Former-commit-id: 973caca253
This commit is contained in:
parent
f5e09f3f3d
commit
7d1719cfb4
@@ -92,7 +92,7 @@ rule merge
|
||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||
|
||||
rule merge_counts
|
||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
|
||||
|
||||
rule freqs2cB
|
||||
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
||||
|
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
||||
import argparse
|
||||
|
||||
|
||||
def merge_lists(input_names, output_name):
|
||||
def merge_lists(input_names, output_name, cutoff=0):
|
||||
count_dicts = []
|
||||
for input_name in input_names:
|
||||
values, total = read_values(input_name, cutoff=0, max_size=1000000)
|
||||
values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
|
||||
count_dicts.append(values)
|
||||
merged = merge_counts(count_dicts)
|
||||
write_wordlist(merged, output_name)
|
||||
@@ -14,7 +14,8 @@ def merge_lists(input_names, output_name):
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
||||
parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
|
||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||
args = parser.parse_args()
|
||||
merge_lists(args.inputs, args.output)
|
||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff)
|
||||
|
||||
|
@@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages):
|
||||
processed_files.append(count_file)
|
||||
|
||||
output_file = wordlist_filename('reddit', 'en', 'counts.txt')
|
||||
add_dep(lines, 'merge_counts', processed_files, output_file)
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_files, output_file,
|
||||
params={'cutoff': 3}
|
||||
)
|
||||
return lines
|
||||
|
||||
|
||||
@@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
|
||||
)
|
||||
|
||||
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
|
||||
add_dep(lines, 'merge_counts', processed_files, output_file)
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_files, output_file,
|
||||
params={'cutoff': 0}
|
||||
)
|
||||
|
||||
return lines
|
||||
|
||||
@@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
|
||||
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
|
||||
)
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_file, output_file
|
||||
lines, 'merge_counts', processed_file, output_file,
|
||||
params={'cutoff': 0}
|
||||
)
|
||||
return lines
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user