fix comment and whitespace involving tokenize_twitter

Rob Speer 2015-06-30 15:18:37 -04:00
parent a59070479e
commit 9a2855394d
2 changed files with 3 additions and 7 deletions


@@ -46,12 +46,8 @@ rule wiki2tokens
 rule tokenize_japanese
   command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 
-# Tokenizing text from Twitter generally requires us to use a more powerful
-# tokenizer than the language-agnostic one.
-#
-# Our usual build process does not use this step. It just assumes it's already
-# done, because it takes a very long time. This is what the 'data/intermediate'
-# directory contains.
+# Tokenizing text from Twitter requires us to language-detect and tokenize
+# in the same step.
 rule tokenize_twitter
   command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
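
For context, this rule is the step the new comment describes: language detection and tokenization happen in one pass over the Twitter input, with each tweet's tokens written to a per-language output file. A minimal sketch of such a step, assuming pycld2 as the CLD2 binding and whitespace splitting as a stand-in for the project's real surface tokenizer; only the function names come from the diff below, and the helper bodies and output naming are assumptions, not wordfreq_builder's actual code:

# A hedged sketch of a combined detect-and-tokenize pass.
import pycld2  # assumed CLD2 binding

def cld2_surface_tokenizer(text):
    # Detect the text's language with CLD2, then tokenize it.
    # In pycld2's result, details[0][1] is the best-match language code.
    lang = pycld2.detect(text)[2][0][1]
    return lang, text.split()

def tokenize_file(in_filename, out_prefix, tokenizer):
    # One pass over the input: detect each tweet's language and append
    # its tokens to a per-language file, <out_prefix>.<lang>.txt.
    out_files = {}
    with open(in_filename, encoding='utf-8') as infile:
        for line in infile:
            lang, tokens = tokenizer(line.strip())
            if lang == 'un':  # CLD2's code for 'unknown language'
                continue
            if lang not in out_files:
                out_files[lang] = open('%s.%s.txt' % (out_prefix, lang),
                                       'w', encoding='utf-8')
            print(' '.join(tokens), file=out_files[lang])
    for f in out_files.values():
        f.close()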


@@ -4,7 +4,7 @@ import argparse
 def tokenize_twitter(in_filename, out_prefix):
     tokenize_file(in_filename, out_prefix,
-                 tokenizer=cld2_surface_tokenizer)
+                  tokenizer=cld2_surface_tokenizer)
 
 def main():
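
The main() that follows presumably wires argparse (imported in the hunk's context above) to tokenize_twitter, matching the $in and $prefix arguments the ninja rule passes. A hedged sketch; the argument names here are assumptions:

def main():
    parser = argparse.ArgumentParser()
    # Argument names are guesses based on the ninja rule's `$in $prefix`.
    parser.add_argument('filename', help='input file of tweets, one per line')
    parser.add_argument('outprefix', help='prefix for per-language output files')
    args = parser.parse_args()
    tokenize_twitter(args.filename, args.outprefix)

if __name__ == '__main__':
    main()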