mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
2f73cc535c
commit
3962b475c1
@ -1,11 +1,11 @@
|
||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
|
||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
|
||||
import argparse
|
||||
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix):
|
||||
tokenize_file(in_filename, out_prefix,
|
||||
tokenizer=cld2_surface_tokenizer
|
||||
)
|
||||
tokenize_twitter(in_filename, out_prefix,
|
||||
tokenizer=cld2_surface_tokenizer
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -40,7 +40,7 @@ def cld2_detect_language(text):
|
||||
return pycld2.detect(text)[2][0][1]
|
||||
|
||||
|
||||
def tokenize_file(in_filename, out_prefix, tokenizer, last_tab=True):
|
||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||
"""
|
||||
Process a file by running it through the given tokenizer, sorting the
|
||||
results by the language of each line, and inserting newlines
|
||||
|
Loading…
Reference in New Issue
Block a user