mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
gzip the intermediate step of Reddit word counting
Former-commit-id: 9a5d9d66bb
parent 682e08fee2
commit f5e09f3f3d
@@ -101,4 +101,4 @@ rule cat
   command = cat $in > $out
 
 rule extract_reddit
-  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
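As a side note, the transformation this rule performs could be sketched roughly in Python. This is only an illustration of what the shell pipeline does, not code from the commit; the function name and the assumption that the Reddit dump holds one JSON comment per line are mine:

import bz2
import gzip
import json

def extract_reddit_bodies(path_in, path_out):
    # bunzip2 -c: stream-decompress the bz2 dump, one JSON object per line
    with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
         gzip.open(path_out, 'wt', encoding='utf-8') as outfile:
        for line in infile:
            body = json.loads(line)['body']            # jq -r '.body'
            if '[deleted]' in body:                    # roughly: fgrep -v '[deleted]'
                continue
            # the sed steps undo HTML escaping of >, < and &
            body = body.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
            # gzip -c > $out: the intermediate file is now written compressed
            outfile.write(body + '\n')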
@@ -246,7 +246,7 @@ def reddit_deps(dirname_in, languages):
     path_in = pathlib.Path(dirname_in)
     for filepath in path_in.glob('*/*.bz2'):
         base = filepath.name[:-4]
-        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
         add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
         count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
         add_dep(lines, 'count', transformed_file, count_file)
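The filename change above keeps the ninja dependency graph consistent with the rule change: extract_reddit now produces a gzipped text file, which the count rule then consumes. A small illustration of the names generated per input shard (the example path is hypothetical, and the directory layout chosen by wordlist_filename is not shown here):

import pathlib

filepath = pathlib.Path('data/raw-input/reddit/2015-01/RC_2015-01.bz2')
base = filepath.name[:-4]          # strips the '.bz2' suffix -> 'RC_2015-01'
transformed = base + '.txt.gz'     # gzipped output of the extract_reddit rule
counts = base + '.counts.txt'      # still-uncompressed output of the count rule
print(base, transformed, counts)
# RC_2015-01 RC_2015-01.txt.gz RC_2015-01.counts.txt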
@@ -24,12 +24,15 @@ def count_tokens(filename):
     containing '�'.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            line = URL_RE.sub('', line.strip())
-            for token in simple_tokenize(line):
-                counts[token] += 1
-
+    if filename.endswith('gz'):
+        infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
+    else:
+        infile = open(filename, encoding='utf-8', errors='replace')
+    for line in infile:
+        line = URL_RE.sub('', line.strip())
+        for token in simple_tokenize(line):
+            counts[token] += 1
+    infile.close()
     return counts
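The tokenizing loop itself can stay the same because gzip.open in text ('rt') mode returns a file object that iterates over decoded lines just like the built-in open, including support for the encoding and errors arguments. A quick standalone check (the file name here is made up for the example):

import gzip

with gzip.open('sample.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('first line\nsecond line\n')

with gzip.open('sample.txt.gz', 'rt', encoding='utf-8', errors='replace') as f:
    for line in f:
        print(line.strip())
# first line
# second line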