fixed tokenize_twitter

This commit is contained in:
Joshua Chin 2015-07-17 16:37:47 -04:00
parent a44927e98e
commit f31f9a1bcd

View File

@@ -2,18 +2,12 @@ from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
import argparse
def tokenize_twitter(in_filename, out_prefix):
tokenize_twitter(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filename', help='filename of input file containing one tweet per line')
parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args()
tokenize_twitter(args.filename, args.outprefix)
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
if __name__ == '__main__':