diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 7aefa4e..f039705 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -101,4 +101,4 @@ rule cat
     command = cat $in > $out
 
 rule extract_reddit
-    command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
+    command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index ad433aa..ef8c368 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -246,7 +246,7 @@ def reddit_deps(dirname_in, languages):
     path_in = pathlib.Path(dirname_in)
     for filepath in path_in.glob('*/*.bz2'):
         base = filepath.name[:-4]
-        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
         add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
         count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
         add_dep(lines, 'count', transformed_file, count_file)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index a21e1f3..47de7e5 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -24,12 +24,15 @@ def count_tokens(filename):
     containing '�'.
""" counts = defaultdict(int) - with open(filename, encoding='utf-8', errors='replace') as infile: - for line in infile: - line = URL_RE.sub('', line.strip()) - for token in simple_tokenize(line): - counts[token] += 1 - + if filename.endswith('gz'): + infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace') + else: + infile = open(filename, encoding='utf-8', errors='replace') + for line in infile: + line = URL_RE.sub('', line.strip()) + for token in simple_tokenize(line): + counts[token] += 1 + infile.close() return counts