mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
gzip the intermediate step of Reddit word counting
This commit is contained in:
parent
95f53e295b
commit
9a5d9d66bb
@ -101,4 +101,4 @@ rule cat
|
||||
command = cat $in > $out
|
||||
|
||||
rule extract_reddit
|
||||
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
|
||||
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
|
||||
|
@ -246,7 +246,7 @@ def reddit_deps(dirname_in, languages):
|
||||
path_in = pathlib.Path(dirname_in)
|
||||
for filepath in path_in.glob('*/*.bz2'):
|
||||
base = filepath.name[:-4]
|
||||
transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
|
||||
transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
|
||||
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
||||
count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
|
||||
add_dep(lines, 'count', transformed_file, count_file)
|
||||
|
@ -24,12 +24,15 @@ def count_tokens(filename):
|
||||
containing '�'.
|
||||
"""
|
||||
counts = defaultdict(int)
|
||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
||||
for line in infile:
|
||||
line = URL_RE.sub('', line.strip())
|
||||
for token in simple_tokenize(line):
|
||||
counts[token] += 1
|
||||
|
||||
if filename.endswith('gz'):
|
||||
infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
|
||||
else:
|
||||
infile = open(filename, encoding='utf-8', errors='replace')
|
||||
for line in infile:
|
||||
line = URL_RE.sub('', line.strip())
|
||||
for token in simple_tokenize(line):
|
||||
counts[token] += 1
|
||||
infile.close()
|
||||
return counts
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user