wordfreq/wordfreq_builder/rules.ninja

# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.
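#
# As a rough sketch of what that generated part looks like (this is only the
# general Ninja form, not actual generated output), each build statement names
# one of the rules defined below:
#
#   build <output file>: <rule name> <input file(s)>
#     <variable> = <value>
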
# Variables
DATA = ./data
# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
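
# A hypothetical build statement using this rule (the real statements are
# generated by a script; these file names are made up for illustration):
#
#   build $DATA/slices/tweets.00 $DATA/slices/tweets.01: split $DATA/raw/tweets.txt
#     prefix = $DATA/slices/tweets.
#     slices = 2
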
# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out

rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
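
# A sketch of how these two rules might be invoked in the generated build file
# (the dump file name is illustrative):
#
#   build $DATA/extracted/enwiki.txt: wiki2text $DATA/raw/enwiki-pages-articles.xml.bz2
#   build $DATA/tokens/enwiki.txt: wiki2tokens $DATA/raw/enwiki-pages-articles.xml.bz2
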
# Tokenizes Twitter data: the wordfreq-tokenize-twitter script takes the input
# file $in and a path prefix $prefix for its output.
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix
# This rule uses command-line tools to take in a file with one token per line,
# and output a comma-separated file with the token counts:
#
# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
#   order, which is fine because we only need this order so we can run
#   'uniq'.
# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
#   large amount of temporary output goes in $tmp.
# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
#   (the count).
# * The 'sed' command rearranges the lines to be comma-separated values with
#   the count coming second, instead of the count being a right-justified
#   number at the start of the line.
#
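# As a small worked example of this pipeline (not real data): if $in contained
# the three lines 'the', 'cat', 'the', the output would be:
#
#   the,2
#   cat,1
#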
rule count
  command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out

rule cat
  command = cat $in > $out