created last_tab flag

This commit is contained in:
Joshua Chin 2015-07-17 15:19:09 -04:00
parent 620becb7e8
commit d6519cf736
3 changed files with 5 additions and 15 deletions

View File

@@ -2,17 +2,10 @@ from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
import argparse
def last_tab(line):
"""
Read lines by keeping only the last tab-separated value.
"""
return line.split('\t')[-1].strip()
def tokenize_twitter(in_filename, out_prefix):
tokenize_file(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer,
line_reader=last_tab)
tokenizer=cld2_surface_tokenizer
)
def main():

View File

@@ -40,7 +40,7 @@ def cld2_detect_language(text):
return pycld2.detect(text)[2][0][1]
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=None):
def tokenize_file(in_filename, out_prefix, tokenizer, last_tab=True):
"""
Process a file by running it through the given tokenizer, sorting the
results by the language of each line, and inserting newlines
@@ -49,10 +49,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=None):
out_files = {}
with open(in_filename, encoding='utf-8') as in_file:
for line in in_file:
if line_reader is not None:
text = line_reader(line)
else:
text = line
text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text)
if language != 'un':
tokenized = '\n'.join(tokens)

View File

@@ -88,7 +88,7 @@ def merge_freqs(freq_dicts):
"""
vocab = set()
for freq_dict in freq_dicts:
vocab |= set(freq_dict)
vocab.update(freq_dict)
merged = defaultdict(float)
N = len(freq_dicts)