Mirror of https://github.com/rspeer/wordfreq.git
builder: Use an optional cutoff when merging counts
This allows the Reddit-merging step to avoid using a ludicrous amount of memory.
Commit 973caca253 (parent 9a5d9d66bb)
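The memory saving comes from filtering while the counts are read, so tokens below the cutoff never enter the merged dictionaries at all. A minimal sketch of that idea in Python (not the actual read_values implementation; the 'word,count' CSV format and descending-count sort order are assumptions):

import csv
from collections import defaultdict

def read_counts_with_cutoff(filename, cutoff=0, max_size=1000000):
    # Hypothetical stand-in for read_values: keep a word only if its count
    # is at least `cutoff`, and stop early once `max_size` words are stored.
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff or len(values) >= max_size:
                break  # assumes rows are sorted by descending count
            values[word] += val
            total += val
    return values, total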
@@ -92,7 +92,7 @@ rule merge
   command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in

 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in

 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
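For the new $cutoff variable to have a value, each build statement that uses the merge_counts rule has to bind it. Roughly, the generated ninja would look like this (the file paths here are made up for illustration; the real ones come from wordlist_filename()):

# illustrative build statement only
build data/generated/reddit/reddit.en.counts.txt: merge_counts data/counts/reddit/part-00.en.counts.txt data/counts/reddit/part-01.en.counts.txt
  cutoff = 3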
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse


-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0, max_size=1000000)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -14,7 +14,8 @@ def merge_lists(input_names, output_name):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
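With the new flag, the merge step can also be run by hand like this (filenames are illustrative):

python -m wordfreq_builder.cli.merge_counts -o reddit.en.counts.txt -c 3 part-00.en.counts.txt part-01.en.counts.txt

Entries with a count below 3 in any input file are skipped while reading, which is what keeps the Reddit merge's memory use down.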
@@ -253,7 +253,10 @@ def reddit_deps(dirname_in, languages):
         processed_files.append(count_file)

     output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
     return lines
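add_dep takes an optional params dict (the subtlex hunks below pass one as well), and here it carries the cutoff through to the ninja file. A hypothetical, simplified version of such a helper, only to show the shape of the output it would produce (not the builder's real add_dep):

def add_dep_sketch(lines, rule, inputs, output, params=None):
    # Emit one ninja build statement plus indented variable bindings,
    # e.g. "  cutoff = 3", which the rule's command reads as $cutoff.
    if isinstance(inputs, str):
        inputs = [inputs]
    lines.append('build {}: {} {}'.format(output, rule, ' '.join(inputs)))
    for name, value in (params or {}).items():
        lines.append('  {} = {}'.format(name, value))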
@@ -289,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
         )

     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )

     return lines
@@ -317,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
             params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )
         add_dep(
-            lines, 'merge_counts', processed_file, output_file
+            lines, 'merge_counts', processed_file, output_file,
+            params={'cutoff': 0}
         )
     return lines