wordfreq/wordfreq_builder/rules.ninja
2015-04-30 16:24:28 -04:00

35 lines
1.3 KiB
Plaintext

# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programatically-defined dependency graph.
# Variables
#
# DATA is a relative path to the data directory. No rule in this header
# references $DATA; presumably the generated build statements that follow
# this header in build.ninja do -- confirm against the generator script.
DATA = ./data
# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
#
# Variables a build statement using this rule must set:
#   slices - the number of parts to produce
#   prefix - the path prefix shared by all of the parts
#
# `split -n r/N` hands out whole lines round-robin, so no line is cut in
# half, and `-d` asks for numeric instead of alphabetic suffixes. Both are
# GNU coreutils options. The directory part of $prefix is created first so
# that split has somewhere to write. `$$` is Ninja's escape for a literal
# `$`, so the shell sees `$(dirname ...)`.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
# wiki2text is a tool I wrote using the development version of Nim, which
# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org.
# The code is at https://github.com/rspeer/wiki2text, but right now it'll
# take a bit of setup to get it to run.
#
# The rule decompresses the bzip2 file $in to stdout, pipes it through
# wiki2text, and writes the result to $out, creating $out's directory first.
# `$$` is Ninja's escape for a literal `$`, so the shell sees
# `$(dirname ...)`.
#
# NOTE(review): the exit status of a shell pipeline is normally that of its
# last command, so a bunzip2 failure (e.g. a truncated dump) may not fail
# this build step as long as wiki2text itself exits successfully -- confirm
# whether that matters for the dumps being processed.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
# Runs the wordfreq-tokenize-twitter command with $in as its first argument
# and $prefix as its second, after creating the directory part of $prefix.
#
# Variables a build statement using this rule must set:
#   prefix - presumably the path prefix for the files the tool writes;
#            verify against wordfreq-tokenize-twitter itself.
#
# `$$` is Ninja's escape for a literal `$`, so the shell sees
# `$(dirname ...)`.
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix
# Concatenates every input file listed in $in, in the order given, into the
# single file $out, replacing any previous contents of $out.
rule cat
  command = cat $in > $out