add rules to count wikipedia tokens

Robyn Speer 2015-05-05 15:21:24 -04:00
parent c55e44e486
commit 7c09fec692
2 changed files with 21 additions and 8 deletions


@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+  command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out

 rule cat
   command = cat $in > $out
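
For illustration, here is a rough Python equivalent of what the 'count' rule computes. The shell version needs the LANG=C byte-order sort only so that identical tokens land on adjacent lines for 'uniq -c'; a Counter makes that step unnecessary. 'count_tokens' is a hypothetical helper written for this sketch, not part of the codebase.

    from collections import Counter

    def count_tokens(in_path, out_path):
        # 'sort $in | uniq -c', minus the need for sorted input
        with open(in_path, encoding='utf-8') as infile:
            counts = Counter(line.rstrip('\n') for line in infile)
        with open(out_path, 'w', encoding='utf-8') as outfile:
            # 'sort -nrk 1': most frequent tokens first
            for token, count in counts.most_common():
                # the sed step: emit 'token,count' instead of aligned columns
                print('{},{}'.format(token, count), file=outfile)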


@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')


 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         print(rulesfile.read(), file=out)

     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)


-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append("  tmp = {}".format(TMPDIR))
     return lines


-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
-                                      combined_prefix, slices):
+def twitter_deps(input_filename, slice_prefix,
+                 combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
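
For one language, the new lines in wikipedia_deps emit a 'count' build statement plus a per-build 'tmp' variable binding. A minimal sketch of the generated ninja text, assuming 'en' as the language and made-up concrete paths standing in for data_filename():

    # Assumed paths, for illustration only; the real ones come from
    # data_filename() and the tokenizer's output naming.
    token_file = 'data/generated/wikipedia/wikipedia_en.txt'
    counts_file = 'data/generated/wikipedia/wikipedia_en.counts.txt'
    tmpdir = 'data/tmp'

    # Same format strings as the new code above
    print("build {outs}: count {ins}".format(outs=counts_file, ins=token_file))
    print("  tmp = {}".format(tmpdir))

    # Generated ninja text:
    #   build data/generated/wikipedia/wikipedia_en.counts.txt: count data/generated/wikipedia/wikipedia_en.txt
    #     tmp = data/tmp

When ninja runs that build statement, $tmp expands to the bound directory inside the 'count' rule's command, so 'mkdir -p $tmp' creates it and 'sort -T $tmp' spills its possibly large temporary files there rather than into the default temporary directory.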