diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 032d80a..a1dc1c7 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-    command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+    command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out

 rule cat
     command = cat $in > $out
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index ab74ec8..1c58154 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')


 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         print(rulesfile.read(), file=out)

     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)


-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append("  tmp = {}".format(TMPDIR))
     return lines


-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
-                                      combined_prefix, slices):
+def twitter_deps(input_filename, slice_prefix,
+                 combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
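
Note (not part of the patch): a minimal sketch of the build statement that the new loop body in wikipedia_deps() emits into the generated deps file. The base paths, the 'fr' language code, and the token-file name are hypothetical stand-ins for whatever data_filename() and the tokenize step actually produce in the repo.

    import pathlib

    # Hypothetical stand-ins; the real values come from data_filename() and CONFIG.
    TMPDIR = 'data/tmp'
    path_out = pathlib.Path('data/generated/wikipedia')
    language = 'fr'

    token_file = path_out / 'wikipedia_{}.txt'.format(language)          # assumed name of the tokenized output
    output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)  # as added in the patch

    lines = []
    lines.append("build {outs}: count {ins}".format(outs=output_file, ins=token_file))
    lines.append("  tmp = {}".format(TMPDIR))  # ninja scopes this $tmp to the build statement above
    print('\n'.join(lines))

    # Prints:
    # build data/generated/wikipedia/wikipedia_fr.counts.txt: count data/generated/wikipedia/wikipedia_fr.txt
    #   tmp = data/tmp

When ninja runs that statement, the count rule expands to the new command, so 'LANG=C sort' creates both output directories and writes its temporary files under $tmp rather than the system default temp directory.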