mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
fixed tokenize_twitter
This commit is contained in:
parent
a44927e98e
commit
f31f9a1bcd
@@ -2,18 +2,12 @@ from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
|
||||
import argparse
|
||||
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix):
|
||||
tokenize_twitter(in_filename, out_prefix,
|
||||
tokenizer=cld2_surface_tokenizer
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('filename', help='filename of input file containing one tweet per line')
|
||||
parser.add_argument('outprefix', help='prefix of output filenames')
|
||||
args = parser.parse_args()
|
||||
tokenize_twitter(args.filename, args.outprefix)
|
||||
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
Loading…
Reference in New Issue
Block a user