mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
created last_tab flag
This commit is contained in:
parent
620becb7e8
commit
d6519cf736
@ -2,17 +2,10 @@ from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
|
||||
import argparse
|
||||
|
||||
|
||||
def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
    """
    # rpartition splits on the final tab; index 2 is the text after it.
    # When no tab is present, rpartition yields ('', '', line), so the
    # whole (stripped) line is returned — same behavior as split('\t')[-1].
    return line.rpartition('\t')[2].strip()
|
||||
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix):
|
||||
tokenize_file(in_filename, out_prefix,
|
||||
tokenizer=cld2_surface_tokenizer,
|
||||
line_reader=last_tab)
|
||||
tokenizer=cld2_surface_tokenizer
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -40,7 +40,7 @@ def cld2_detect_language(text):
|
||||
return pycld2.detect(text)[2][0][1]
|
||||
|
||||
|
||||
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=None):
|
||||
def tokenize_file(in_filename, out_prefix, tokenizer, last_tab=True):
|
||||
"""
|
||||
Process a file by running it through the given tokenizer, sorting the
|
||||
results by the language of each line, and inserting newlines
|
||||
@ -49,10 +49,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=None):
|
||||
out_files = {}
|
||||
with open(in_filename, encoding='utf-8') as in_file:
|
||||
for line in in_file:
|
||||
if line_reader is not None:
|
||||
text = line_reader(line)
|
||||
else:
|
||||
text = line
|
||||
text = line.split('\t')[-1].strip()
|
||||
language, tokens = tokenizer(text)
|
||||
if language != 'un':
|
||||
tokenized = '\n'.join(tokens)
|
||||
|
@ -88,7 +88,7 @@ def merge_freqs(freq_dicts):
|
||||
"""
|
||||
vocab = set()
|
||||
for freq_dict in freq_dicts:
|
||||
vocab |= set(freq_dict)
|
||||
vocab.update(freq_dict)
|
||||
|
||||
merged = defaultdict(float)
|
||||
N = len(freq_dicts)
|
||||
|
Loading…
Reference in New Issue
Block a user