From f5e09f3f3da6cb4ca43ac522d4d173ea88238694 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Wed, 9 Dec 2015 13:30:08 -0500 Subject: [PATCH] gzip the intermediate step of Reddit word counting Former-commit-id: 9a5d9d66bb206357af5443b0b7f0bf81a4243e45 --- wordfreq_builder/rules.ninja | 2 +- wordfreq_builder/wordfreq_builder/ninja.py | 2 +- wordfreq_builder/wordfreq_builder/word_counts.py | 15 +++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 7aefa4e..f039705 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -101,4 +101,4 @@ rule cat command = cat $in > $out rule extract_reddit - command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' > $out + command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | gzip -c > $out diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ad433aa..ef8c368 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -246,7 +246,7 @@ def reddit_deps(dirname_in, languages): path_in = pathlib.Path(dirname_in) for filepath in path_in.glob('*/*.bz2'): base = filepath.name[:-4] - transformed_file = wordlist_filename('reddit', 'en', base + '.txt') + transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz') add_dep(lines, 'extract_reddit', str(filepath), transformed_file) count_file = wordlist_filename('reddit', 'en', base + '.counts.txt') add_dep(lines, 'count', transformed_file, count_file) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index a21e1f3..47de7e5 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -24,12 +24,15 @@ def count_tokens(filename): containing '�'. 
""" counts = defaultdict(int) - with open(filename, encoding='utf-8', errors='replace') as infile: - for line in infile: - line = URL_RE.sub('', line.strip()) - for token in simple_tokenize(line): - counts[token] += 1 - + if filename.endswith('gz'): + infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace') + else: + infile = open(filename, encoding='utf-8', errors='replace') + for line in infile: + line = URL_RE.sub('', line.strip()) + for token in simple_tokenize(line): + counts[token] += 1 + infile.close() return counts