From 9a2855394d0d57478ac83ecdd092885de633623e Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 30 Jun 2015 15:18:37 -0400
Subject: [PATCH] fix comment and whitespace involving tokenize_twitter

---
 wordfreq_builder/rules.ninja                              | 8 ++------
 wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 0c72cfe..4be9f25 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -46,12 +46,8 @@ rule wiki2tokens
 rule tokenize_japanese
   command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 
-# Tokenizing text from Twitter generally requires us to use a more powerful
-# tokenizer than the language-agnostic one.
-#
-# Our usual build process does not use this step. It just assumes it's already
-# done, because it takes a very long time. This is what the 'data/intermediate'
-# directory contains.
+# Tokenizing text from Twitter requires us to language-detect and tokenize
+# in the same step.
 rule tokenize_twitter
   command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
index 87fc171..df2cb6b 100644
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@@ -4,7 +4,7 @@ import argparse
 
 def tokenize_twitter(in_filename, out_prefix):
     tokenize_file(in_filename, out_prefix,
-        tokenizer=cld2_surface_tokenizer)
+                  tokenizer=cld2_surface_tokenizer)
 
 
 def main():
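
For context on the comment this patch rewrites: below is a minimal sketch of
what a combined language-detect-and-tokenize step can look like, assuming the
pycld2 package. The helper names detect_and_tokenize and tokenize_file_sketch
are hypothetical; the real cld2_surface_tokenizer and tokenize_file live in
wordfreq_builder.tokenizers and may differ in signature and behavior.

    # Sketch only, not the actual wordfreq_builder code.
    import re
    import pycld2

    TOKEN_RE = re.compile(r"\w+", re.UNICODE)  # stand-in surface tokenizer

    def detect_and_tokenize(text):
        """Return (language_code, tokens) for one tweet, in a single pass."""
        # pycld2.detect returns (isReliable, bytesFound, details); details[0][1]
        # is the top language's code, e.g. 'en'.
        reliable, _nbytes, details = pycld2.detect(text)
        lang = details[0][1] if reliable else 'un'
        tokens = [t.lower() for t in TOKEN_RE.findall(text)]
        return lang, tokens

    def tokenize_file_sketch(in_filename, out_prefix):
        """Write each tweet's tokens to a per-language file: <prefix>.<lang>.txt."""
        out_files = {}
        with open(in_filename, encoding='utf-8') as infile:
            for line in infile:
                lang, tokens = detect_and_tokenize(line.strip())
                if lang not in out_files:
                    out_files[lang] = open('%s.%s.txt' % (out_prefix, lang),
                                           'w', encoding='utf-8')
                print(' '.join(tokens), file=out_files[lang])
        for f in out_files.values():
            f.close()

Doing detection and tokenization together is what lets one pass over the
Twitter corpus fan tweets out into per-language token files, which is why the
ninja rule takes an output $prefix rather than a single $out file.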