Stylistic cleanups to word_counts.py.

This commit is contained in:
Andrew Lin 2015-07-31 19:26:18 -04:00
parent 66c69e6fac
commit 6d40912ef9

View File

@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
raw_counts = defaultdict(float) raw_counts = defaultdict(float)
total = 0. total = 0.
with open(filename, encoding='utf-8', newline='') as infile: with open(filename, encoding='utf-8', newline='') as infile:
reader = csv.reader(infile) for key, strval in csv.reader(infile):
for key, strval in reader:
val = float(strval) val = float(strval)
if val < cutoff: if val < cutoff:
break break
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens: for token in tokens:
token = fix_text(token)
total += val
# Use += so that, if we give the reader concatenated files with # Use += so that, if we give the reader concatenated files with
# duplicates, it does the right thing # duplicates, it does the right thing
raw_counts[token] += val raw_counts[fix_text(token)] += val
total += val
for word in raw_counts: for word in raw_counts:
raw_counts[word] /= total raw_counts[word] /= total