diff --git a/wordfreq_builder/lib/jq-linux64 b/wordfreq_builder/lib/jq-linux64 new file mode 100755 index 0000000..939227e Binary files /dev/null and b/wordfreq_builder/lib/jq-linux64 differ diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index df00062..7aefa4e 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -13,7 +13,7 @@ # contains the programatically-defined dependency graph. # Variables -DATA = ./data +JQ = lib/jq-linux64 # How to build the build.ninja file itself. (Use the Makefile to get it the # first time.) @@ -99,3 +99,6 @@ rule freqs2cB rule cat command = cat $in > $out + +rule extract_reddit + command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' > $out diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py index 5e3de69..6e5bff9 100644 --- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py +++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py @@ -5,7 +5,7 @@ import argparse def merge_lists(input_names, output_name): count_dicts = [] for input_name in input_names: - values, total = read_values(input_name, cutoff=0) + values, total = read_values(input_name, cutoff=0, max_size=1000000) count_dicts.append(values) merged = merge_counts(count_dicts) write_wordlist(merged, output_name) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 7ae1798..e0006e1 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -40,7 +40,8 @@ CONFIG = { ], 'subtlex-en': ['en'], 'subtlex-other': ['de', 'nl', 'zh'], - 'jieba': ['zh'] + 'jieba': ['zh'], + 'reddit': ['en'], }, # Subtlex languages that need to be pre-processed 'wordlist_paths': { @@ -52,6 +53,7 @@ CONFIG = { 'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}', 'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}', 'jieba': 
'generated/jieba/jieba_{lang}.{ext}', + 'reddit': 'generated/reddit/reddit_{lang}.{ext}', 'combined': 'generated/combined/combined_{lang}.{ext}', 'combined-dist': 'dist/combined_{lang}.{ext}', 'twitter-dist': 'dist/twitter_{lang}.{ext}', diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 80437ff..ad433aa 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout): data_filename('source-lists/subtlex'), CONFIG['sources']['subtlex-other'] ), + reddit_deps( + data_filename('raw-input/reddit'), + CONFIG['sources']['reddit'] + ), jieba_deps( data_filename('source-lists/jieba'), CONFIG['sources']['jieba'] @@ -232,6 +236,27 @@ def jieba_deps(dirname_in, languages): return lines +def reddit_deps(dirname_in, languages): + lines = [] + if not languages: + return lines + assert languages == ['en'] + + processed_files = [] + path_in = pathlib.Path(dirname_in) + for filepath in path_in.glob('*/*.bz2'): + base = filepath.name[:-4] + transformed_file = wordlist_filename('reddit', 'en', base + '.txt') + add_dep(lines, 'extract_reddit', str(filepath), transformed_file) + count_file = wordlist_filename('reddit', 'en', base + '.counts.txt') + add_dep(lines, 'count', transformed_file, count_file) + processed_files.append(count_file) + + output_file = wordlist_filename('reddit', 'en', 'counts.txt') + add_dep(lines, 'merge_counts', processed_files, output_file) + return lines + + # Which columns of the SUBTLEX data files do the word and its frequency appear # in? 
SUBTLEX_COLUMN_MAP = { diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index a3bf0ae..a21e1f3 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -33,7 +33,7 @@ def count_tokens(filename): return counts -def read_values(filename, cutoff=0, lang=None): +def read_values(filename, cutoff=0, max_size=1e8, lang=None): """ Read words and their frequency or count values from a CSV file. Returns a dictionary of values and the total of all values. @@ -52,7 +52,7 @@ def read_values(filename, cutoff=0, lang=None): for key, strval in csv.reader(infile): val = float(strval) key = fix_text(key) - if val < cutoff: + if val < cutoff or len(values) >= max_size: break tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) for token in tokens: @@ -76,7 +76,7 @@ def read_freqs(filename, cutoff=0, lang=None): If lang is given, read_freqs will apply language specific preprocessing operations. """ - values, total = read_values(filename, cutoff, lang) + values, total = read_values(filename, cutoff, lang=lang) for word in values: values[word] /= total