wordfreq/wordfreq_builder/rules.ninja

81 lines
3.4 KiB
Plaintext
Raw Normal View History

2015-04-29 19:59:06 +00:00
# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
2015-04-29 21:13:58 +00:00
#
2015-04-29 19:59:06 +00:00
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.
# Variables
# DATA points at the data directory, relative to where ninja is run. None of
# the rules in this header reference it -- presumably the generated build
# statements that get appended to this file do; confirm.
DATA = ./data
# Produces the build.ninja file itself: the build_deps script is run on $in
# and its standard output is captured as $out. (The very first build.ninja
# has to come from the Makefile.)
rule build_deps
  command = python -m wordfreq_builder.cli.build_deps $in $
            > $out
2015-04-29 19:59:06 +00:00
# Divides the single input file $in into $slices pieces, distributing lines
# round-robin (split's r/N mode). Each piece is named $prefix followed by a
# two-digit numeric suffix; the directory containing $prefix is created
# first if it does not exist.
rule split
  command = mkdir -p $$(dirname $prefix) && $
            split -d -n r/$slices $in $prefix
2015-04-29 19:59:06 +00:00
# wiki2text (https://github.com/rspeer/wiki2text) is a tool I wrote using
# Nim 0.11. It extracts plain text from Wikipedia dumps obtained from
# dumps.wikimedia.org. Here the bzip2-compressed dump $in is decompressed to
# a pipe, and wiki2text's output becomes $out.
rule wiki2text
  command = mkdir -p $$(dirname $out) && $
            bunzip2 -c $in | wiki2text > $out
# To tokenize Japanese, we run it through MeCab and take the first
# tab-separated column of its output, giving one token per line.
#
# MeCab's output also contains "EOS" marker lines, which are filtered out.
# The filter uses grep -x so that only lines consisting of exactly "EOS" are
# removed; without it, every token that merely contains the letters "EOS"
# would be discarded as well.
#
# We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese
  command = mkdir -p $$(dirname $out) && $
            mecab -b 1048576 < $in | cut -f 1 | grep -vx "EOS" > $out
2015-05-07 20:49:53 +00:00
# Text from Twitter has to be language-detected and tokenized in one and the
# same step. The tokenize_twitter script is given the input file $in and
# $prefix (presumably the common prefix of the files it writes -- confirm
# against the script); the directory containing $prefix is created first.
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && $
            python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
2015-05-07 20:49:53 +00:00
# Converting the Leeds corpus: the lines of interest are space-separated and
# consist of an integer, a decimal, and then the term. The integer is the
# rank, which is thrown away; the decimal is the frequency. Each such line is
# rewritten as "term,frequency". sed runs with -n and the /p flag so that
# only lines where the substitution succeeded are printed.
#
# Leeds used MeCab without stripping its "EOS" lines, so the term "EOS" is
# grepped out afterwards.
# NOTE(review): the pattern 'EOS,' is not anchored on the left, so any term
# that merely ends in "EOS" is dropped too -- confirm whether that matters.
rule convert_leeds
  command = mkdir -p $$(dirname $out) && $
            sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | $
            grep -v 'EOS,' > $out
2015-05-07 20:49:53 +00:00
# The OpenSubtitles frequency data only needs its delimiter changed: every
# space in $in becomes a comma in $out.
rule convert_opensubtitles
  command = mkdir -p $$(dirname $out) && $
            tr ' ' ',' < $in > $out
2015-04-29 21:13:58 +00:00
2015-05-11 22:44:28 +00:00
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the (gzipped) input files, keep only the single words and their counts
# (tab-separated fields 1 and 3), and only keep lines with counts of 100 or
# more.
#
# Pipeline stages:
#   - cut keeps the word and its count.
#   - grep discards any line containing a comma or a double quote.
#   - sed rewrites "word<whitespace>count" as "word,count" and prints only the
#     lines that matched. The count must be three or more digits running to
#     the end of the line, which is what enforces the "100 or more" cutoff;
#     anchoring the pattern keeps malformed lines from slipping through.
#
# (These will still be repeated as the word appears in different grammatical
# roles, information that the source data provides that we're discarding. The
# source data was already filtered to only show words in roles with at least
# two-digit counts of occurrences.)
rule convert_google_syntactic_ngrams
  command = mkdir -p $$(dirname $out) && $
            zcat $in | cut -f 1,3 | grep -v '[,"]' | $
            sed -rn 's/^(.*)\s([0-9]{3,})$$/\1,\2/p' > $out
# Runs the count_tokens script with $in and $out as its two arguments, after
# making sure the directory for $out exists.
rule count
  command = mkdir -p $$(dirname $out) && $
            python -m wordfreq_builder.cli.count_tokens $in $out
2015-05-07 23:38:33 +00:00
# Runs the combine_lists script over all of the inputs in $in, naming the
# result file with -o $out. The directory for $out is created first.
rule merge
  command = mkdir -p $$(dirname $out) && $
            python -m wordfreq_builder.cli.combine_lists -o $out $in
# Runs the freqs_to_cB script with $in and $out as its two arguments, after
# making sure the directory for $out exists.
rule freqs2cB
  command = mkdir -p $$(dirname $out) && $
            python -m wordfreq_builder.cli.freqs_to_cB $in $out
2015-05-07 23:38:33 +00:00
2015-04-29 21:13:58 +00:00
# Concatenates every file in $in, in order, into $out. As with the other
# file-producing rules in this file, the directory for $out is created first,
# so this rule also works when it is the first step to write into that
# directory.
rule cat
  command = mkdir -p $$(dirname $out) && $
            cat $in > $out