gzip the intermediate step of Reddit word counting

commit f5e09f3f3d (parent 682e08fee2)
Former-commit-id: 9a5d9d66bb
Author: Robyn Speer
Date: 2015-12-09 13:30:08 -05:00

3 changed files with 11 additions and 8 deletions

@@ -101,4 +101,4 @@ rule cat
   command = cat $in > $out
 rule extract_reddit
-  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
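
In plain terms, the extract_reddit rule decompresses a monthly Reddit comment dump, pulls each comment's 'body' field out of the JSON with jq, drops lines containing '[deleted]', undoes Reddit's HTML escaping of >, <, and &, and now compresses the result with gzip instead of writing plain text. A rough Python equivalent of the amended pipeline, for intuition only (the function name and paths are made up here, and html.unescape handles more entities than the three sed substitutions do):

    import bz2
    import gzip
    import html
    import json

    def extract_reddit(path_in, path_out):
        # Mirror the ninja rule: decompress the .bz2 dump, take each
        # comment's 'body', skip lines mentioning '[deleted]', unescape
        # HTML entities, and write the text back out through gzip.
        with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
             gzip.open(path_out, 'wt', encoding='utf-8') as outfile:
            for line in infile:
                body = json.loads(line)['body']
                if '[deleted]' in body:
                    continue
                print(html.unescape(body), file=outfile)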

@@ -246,7 +246,7 @@ def reddit_deps(dirname_in, languages):
     path_in = pathlib.Path(dirname_in)
     for filepath in path_in.glob('*/*.bz2'):
         base = filepath.name[:-4]
-        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
         add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
         count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
         add_dep(lines, 'count', transformed_file, count_file)
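
In the build graph, each monthly dump now flows through a gzipped intermediate: extract_reddit produces the .txt.gz file, and count reads it to produce the still-uncompressed .counts.txt. Assuming add_dep emits ordinary ninja build statements of the form 'build output: rule input' (the directory layout below is illustrative, not taken from the repo), the generated dependencies look roughly like:

    build data/reddit/en/RC_2015-01.txt.gz: extract_reddit data/raw/reddit/RC_2015-01.bz2
    build data/reddit/en/RC_2015-01.counts.txt: count data/reddit/en/RC_2015-01.txt.gz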

@@ -24,12 +24,15 @@ def count_tokens(filename):
     containing '�'.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            line = URL_RE.sub('', line.strip())
-            for token in simple_tokenize(line):
-                counts[token] += 1
+    if filename.endswith('gz'):
+        infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
+    else:
+        infile = open(filename, encoding='utf-8', errors='replace')
+    for line in infile:
+        line = URL_RE.sub('', line.strip())
+        for token in simple_tokenize(line):
+            counts[token] += 1
+    infile.close()
     return counts
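
Two details are worth noting here. First, gzip.open in 'rt' mode (Python 3.3 and later) accepts the same encoding and errors arguments as the built-in open, so the reading loop works unchanged on both compressed and plain files; the hunk assumes 'import gzip' appears elsewhere in the module. Second, the explicit infile.close() is skipped if tokenization raises, leaking the handle. A sketch of an equivalent that keeps the with-statement guarantee, assuming URL_RE and simple_tokenize as defined elsewhere in this module (a variant for illustration, not the committed code):

    import gzip
    from collections import defaultdict

    def count_tokens(filename):
        counts = defaultdict(int)
        # Both gzip.open and the built-in open accept mode 'rt' with
        # encoding/errors, so one with-block covers both cases and
        # closes the file even when an exception propagates.
        opener = gzip.open if filename.endswith('gz') else open
        with opener(filename, 'rt', encoding='utf-8', errors='replace') as infile:
            for line in infile:
                line = URL_RE.sub('', line.strip())
                for token in simple_tokenize(line):
                    counts[token] += 1
        return counts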