build a bigger wordlist that we can optionally use

Former-commit-id: df8caaff7d
Robyn Speer 2016-01-12 14:05:17 -05:00
parent 2069e30c89
commit 738243e244
4 changed files with 15 additions and 4 deletions

@@ -95,7 +95,7 @@ rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
 
 rule freqs2cB
-  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
+  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets
 
 rule cat
   command = cat $in > $out
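
The freqs2cB rule now reads its cutoff from a per-build "buckets" variable. As a rough sketch, with file names invented purely for illustration (the real paths come from the CONFIG templates and the generated build statements below), ninja would expand this rule into a command such as:

    python -m wordfreq_builder.cli.freqs_to_cB generated/combined/combined_en.txt dist/combined-large_en.msgpack.gz -b 900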

@@ -6,5 +6,9 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
+    parser.add_argument('-b', '--buckets', type=int, default=600,
+                        help='Number of centibel buckets to include (default 600). '
+                             'Increasing this number creates a longer wordlist with '
+                             'rarer words.')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
+    freqs_to_cBpack(args.filename_in, args.filename_out, cutoff=-(args.buckets))
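
For context, the cBpack format stores each word's frequency in centibels, 100 times the log10 of its frequency, so the number of buckets maps directly to a frequency floor. A minimal Python sketch of that relationship, assuming the standard centibel definition used by wordfreq:

    import math

    def freq_to_cB(freq):
        # Centibels: 100 * log10(frequency); negative for frequencies below 1.
        return round(100 * math.log10(freq))

    # cutoff = -(buckets): 600 buckets keeps words down to 10 ** (-600 / 100) = 1e-6,
    # while 900 buckets reaches down to 1e-9, admitting much rarer words.
    assert freq_to_cB(1e-6) == -600
    assert freq_to_cB(1e-9) == -900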

@@ -56,6 +56,7 @@ CONFIG = {
         'reddit': 'generated/reddit/reddit_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
+        'combined-dist-large': 'dist/combined-large_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
         'jieba-dist': 'dist/jieba_{lang}.{ext}'
     },
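
The new 'combined-dist-large' entry follows the same template scheme as the existing paths; wordlist_filename fills in the language and extension. Illustrative substitution only:

    >>> 'dist/combined-large_{lang}.{ext}'.format(lang='en', ext='msgpack.gz')
    'dist/combined-large_en.msgpack.gz'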

@@ -345,9 +345,15 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz'
         )
+        output_cBpack_big = wordlist_filename(
+            'combined-dist-large', language, 'msgpack.gz'
+        )
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
-                params={'lang': language})
+                params={'lang': language, 'buckets': 600})
+        add_dep(lines, 'freqs2cB', output_file, output_cBpack_big,
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language, 'buckets': 900})
         lines.append('default {}'.format(output_cBpack))

@@ -358,7 +364,7 @@ def combine_lists(languages):
             'twitter-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', input_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
-                params={'lang': language})
+                params={'lang': language, 'buckets': 600})
         lines.append('default {}'.format(output_cBpack))
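
Tying it together: each add_dep call above presumably emits one ninja build statement whose variables feed the $buckets reference added to the freqs2cB rule. The helper below is a hypothetical stand-in, not the real add_dep, and its file names are invented, but it shows the shape of the generated text:

    def sketch_build_statement(rule, infile, outfile, extra, params):
        # Shape of a ninja build statement: output, rule name, explicit input,
        # implicit dependency after '|', then one indented 'var = value' per param.
        stmt = ['build {}: {} {} | {}'.format(outfile, rule, infile, extra)]
        stmt += ['  {} = {}'.format(key, value) for key, value in params.items()]
        return '\n'.join(stmt)

    print(sketch_build_statement(
        'freqs2cB', 'generated/combined/combined_en.txt',
        'dist/combined-large_en.msgpack.gz',
        'wordfreq_builder/word_counts.py',
        {'lang': 'en', 'buckets': 900}))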