diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index ac9d4a0..c05841a 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -95,7 +95,7 @@ rule merge_counts command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in rule freqs2cB - command = python -m wordfreq_builder.cli.freqs_to_cB $in $out + command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets rule cat command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py index 73edb44..5dc6966 100644 --- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py @@ -6,5 +6,9 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_out', help='name of output file') + parser.add_argument('-b', '--buckets', type=int, default=600, + help='Number of centibel buckets to include (default 600). ' + 'Increasing this number creates a longer wordlist with ' + 'rarer words.') args = parser.parse_args() - freqs_to_cBpack(args.filename_in, args.filename_out) + freqs_to_cBpack(args.filename_in, args.filename_out, cutoff=-(args.buckets)) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index e0006e1..92c1029 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -56,6 +56,7 @@ CONFIG = { 'reddit': 'generated/reddit/reddit_{lang}.{ext}', 'combined': 'generated/combined/combined_{lang}.{ext}', 'combined-dist': 'dist/combined_{lang}.{ext}', + 'combined-dist-large': 'dist/combined-large_{lang}.{ext}', 'twitter-dist': 'dist/twitter_{lang}.{ext}', 'jieba-dist': 'dist/jieba_{lang}.{ext}' }, diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index dc2a058..7487f75 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -345,9 +345,15 @@ def combine_lists(languages): output_cBpack = wordlist_filename( 'combined-dist', language, 'msgpack.gz' ) + output_cBpack_big = wordlist_filename( + 'combined-dist-large', language, 'msgpack.gz' + ) add_dep(lines, 'freqs2cB', output_file, output_cBpack, extra='wordfreq_builder/word_counts.py', - params={'lang': language}) + params={'lang': language, 'buckets': 600}) + add_dep(lines, 'freqs2cB', output_file, output_cBpack_big, + extra='wordfreq_builder/word_counts.py', + params={'lang': language, 'buckets': 900}) lines.append('default {}'.format(output_cBpack)) @@ -358,7 +364,7 @@ def combine_lists(languages): 'twitter-dist', language, 'msgpack.gz') add_dep(lines, 'freqs2cB', input_file, output_cBpack, extra='wordfreq_builder/word_counts.py', - params={'lang': language}) + params={'lang': language, 'buckets': 600}) lines.append('default {}'.format(output_cBpack))