diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 5b7c988..eaaabcd 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -44,11 +44,24 @@ rule wiki2tokens
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
-  command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" > $out
+  command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 
+# Tokenizing text from Twitter generally requires us to use a more powerful
+# tokenizer than the language-agnostic one.
+#
+# Our usual build process does not use this step. It just assumes it's already
+# done, because it takes a very long time. This is what the 'data/intermediate'
+# directory contains.
 rule tokenize_twitter
   command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix
 
+# Convert the output of tokenize_twitter into the form used by wiki2tokens,
+# by inserting line breaks between tokens, and removing things that appear
+# to be URLs or usernames.
+#
+# I could have output it this way in the first place, but the fact is that I
+# didn't. Writing this rule to fix it was easier than re-doing three days of
+# computation.
 rule format_twitter
   command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out
 
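
A note on the tokenize_japanese change: `mecab -b 1048576` raises MeCab's input buffer size so that very long input lines are not silently truncated. The rest of the pipeline keeps only the surface form, which MeCab prints as the first tab-separated column, and drops the bare `EOS` lines it emits at sentence boundaries. Here is a minimal Python sketch of that same filtering, for illustration only; the build runs the shell pipeline above, not this script:

```python
import sys

def tokens_from_mecab_output(lines):
    # MeCab's default output is "surface<TAB>feature,feature,...", with a
    # bare "EOS" line after each sentence; keep only the surface forms.
    for line in lines:
        line = line.rstrip("\n")
        if not line or line == "EOS":
            continue
        yield line.split("\t", 1)[0]

if __name__ == "__main__":
    for token in tokens_from_mecab_output(sys.stdin):
        print(token)
```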
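
The comment on format_twitter describes the transformation, but the code itself lives in `wordfreq_builder.cli.format_twitter`, which this diff does not show. The sketch below is only a guess at its shape, assuming the pretokenized files hold one tweet of space-separated tokens per line; the URL and username checks are placeholders, not the project's actual heuristics:

```python
import sys

def looks_like_url(token):
    # Placeholder heuristic; the real rule's notion of a URL may differ.
    return token.startswith(("http://", "https://", "www."))

def looks_like_username(token):
    # Placeholder heuristic for Twitter @mentions.
    return token.startswith("@")

def format_tokens(infile, outfile):
    # One space-separated line of tokens in, one token per output line,
    # with URL-like tokens and usernames removed.
    for line in infile:
        for token in line.split():
            if looks_like_url(token) or looks_like_username(token):
                continue
            outfile.write(token + "\n")

if __name__ == "__main__":
    format_tokens(sys.stdin, sys.stdout)
```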