mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
build a bigger wordlist that we can optionally use
Former-commit-id: df8caaff7d
This commit is contained in:
parent
2069e30c89
commit
738243e244
@ -95,7 +95,7 @@ rule merge_counts
|
|||||||
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
|
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
|
||||||
|
|
||||||
rule freqs2cB
|
rule freqs2cB
|
||||||
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets
|
||||||
|
|
||||||
rule cat
|
rule cat
|
||||||
command = cat $in > $out
|
command = cat $in > $out
|
||||||
|
@ -6,5 +6,9 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('filename_in', help='name of input file containing tokens')
|
parser.add_argument('filename_in', help='name of input file containing tokens')
|
||||||
parser.add_argument('filename_out', help='name of output file')
|
parser.add_argument('filename_out', help='name of output file')
|
||||||
|
parser.add_argument('-b', '--buckets', type=int, default=600,
|
||||||
|
help='Number of centibel buckets to include (default 600). '
|
||||||
|
'Increasing this number creates a longer wordlist with '
|
||||||
|
'rarer words.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
freqs_to_cBpack(args.filename_in, args.filename_out)
|
freqs_to_cBpack(args.filename_in, args.filename_out, cutoff=-(args.buckets))
|
||||||
|
@ -56,6 +56,7 @@ CONFIG = {
|
|||||||
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
|
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
|
||||||
'combined': 'generated/combined/combined_{lang}.{ext}',
|
'combined': 'generated/combined/combined_{lang}.{ext}',
|
||||||
'combined-dist': 'dist/combined_{lang}.{ext}',
|
'combined-dist': 'dist/combined_{lang}.{ext}',
|
||||||
|
'combined-dist-large': 'dist/combined-large_{lang}.{ext}',
|
||||||
'twitter-dist': 'dist/twitter_{lang}.{ext}',
|
'twitter-dist': 'dist/twitter_{lang}.{ext}',
|
||||||
'jieba-dist': 'dist/jieba_{lang}.{ext}'
|
'jieba-dist': 'dist/jieba_{lang}.{ext}'
|
||||||
},
|
},
|
||||||
|
@ -345,9 +345,15 @@ def combine_lists(languages):
|
|||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'combined-dist', language, 'msgpack.gz'
|
'combined-dist', language, 'msgpack.gz'
|
||||||
)
|
)
|
||||||
|
output_cBpack_big = wordlist_filename(
|
||||||
|
'combined-dist-large', language, 'msgpack.gz'
|
||||||
|
)
|
||||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'lang': language})
|
params={'lang': language, 'buckets': 600})
|
||||||
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack_big,
|
||||||
|
extra='wordfreq_builder/word_counts.py',
|
||||||
|
params={'lang': language, 'buckets': 900})
|
||||||
|
|
||||||
lines.append('default {}'.format(output_cBpack))
|
lines.append('default {}'.format(output_cBpack))
|
||||||
|
|
||||||
@ -358,7 +364,7 @@ def combine_lists(languages):
|
|||||||
'twitter-dist', language, 'msgpack.gz')
|
'twitter-dist', language, 'msgpack.gz')
|
||||||
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'lang': language})
|
params={'lang': language, 'buckets': 600})
|
||||||
|
|
||||||
lines.append('default {}'.format(output_cBpack))
|
lines.append('default {}'.format(output_cBpack))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user