Renamed tokenize_file to tokenize_twitter

Former-commit-id: 303bd88ba2
This commit is contained in:
Joshua Chin 2015-07-17 15:27:26 -04:00
parent 2f73cc535c
commit 3962b475c1
2 changed files with 5 additions and 5 deletions

View File

@ -1,11 +1,11 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
import argparse
def tokenize_twitter(in_filename, out_prefix):
tokenize_file(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer
)
tokenize_twitter(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer
)
def main():

View File

@ -40,7 +40,7 @@ def cld2_detect_language(text):
return pycld2.detect(text)[2][0][1]
def tokenize_file(in_filename, out_prefix, tokenizer, last_tab=True):
def tokenize_twitter(in_filename, out_prefix, tokenizer):
"""
Process a file by running it through the given tokenizer, sorting the
results by the language of each line, and inserting newlines