mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
add word frequencies from the Reddit 2007-2015 corpus
This commit is contained in:
parent e1f7a1ccf3
commit b2d7546d2d
BIN  wordfreq_builder/lib/jq-linux64  (new executable file; binary not shown)
@@ -13,7 +13,8 @@
 # contains the programatically-defined dependency graph.
 
 # Variables
 DATA = ./data
+JQ = lib/jq-linux64
 
 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@@ -99,3 +99,6 @@ rule freqs2cB
 
 rule cat
   command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
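The extract_reddit rule decompresses each monthly dump, pulls the body field out of every JSON comment with jq, drops '[deleted]' comments, and undoes Reddit's HTML escaping of >, <, and &. A rough Python equivalent of the pipeline, for illustration only; the build runs the shell command above, and the function and argument names here are made up:

import bz2
import json

def extract_reddit_bodies(path_in, path_out):
    # Mirrors: bunzip2 -c | jq -r '.body' | fgrep -v '[deleted]' | sed unescaping
    with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
         open(path_out, 'w', encoding='utf-8') as outfile:
        for line in infile:
            body = json.loads(line)['body']
            if '[deleted]' in body:
                continue
            # Reddit HTML-escapes >, <, and & in comment bodies; undo it,
            # replacing &amp; last, in the same order as the sed chain.
            body = body.replace('&gt;', '>').replace('&lt;', '<')
            body = body.replace('&amp;', '&')
            outfile.write(body + '\n')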
@@ -5,7 +5,7 @@ import argparse
 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=0, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
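merge_lists now caps each input at a million distinct tokens, which keeps the merge step's memory bounded when combining many monthly Reddit count files. merge_counts itself is not part of this diff; presumably it sums the per-file counts, along the lines of this sketch (the implementation shown is illustrative, not the repo's):

from collections import Counter

def merge_counts(count_dicts):
    # Sum the token counts from every per-file dictionary.
    merged = Counter()
    for counts in count_dicts:
        merged.update(counts)
    return merged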
@@ -40,7 +40,8 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
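Each 'wordlist_paths' entry is a template that wordlist_filename fills in with a language code and a file suffix. Assuming it is a plain str.format over these templates, rooted at the data directory (the DATA = ./data variable above), the new Reddit entry would resolve like this; the helper shown is a hypothetical stand-in, not the real implementation:

CONFIG = {
    'data_dir': './data',
    'wordlist_paths': {
        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
    },
}

def wordlist_filename(source, lang, ext):
    # Hypothetical stand-in: fill in the configured template and
    # anchor the result under the data directory.
    relative = CONFIG['wordlist_paths'][source].format(lang=lang, ext=ext)
    return CONFIG['data_dir'] + '/' + relative

# wordlist_filename('reddit', 'en', 'counts.txt')
#   -> './data/generated/reddit/reddit_en.counts.txt'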
@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-other']
         ),
+        reddit_deps(
+            data_filename('raw-input/reddit'),
+            CONFIG['sources']['reddit']
+        ),
         jieba_deps(
             data_filename('source-lists/jieba'),
             CONFIG['sources']['jieba']
@@ -232,6 +236,27 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(lines, 'merge_counts', processed_files, output_file)
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
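reddit_deps turns every monthly dump found under raw-input/reddit into two build steps, extract_reddit and then count, and merges all the per-month count files at the end. add_dep is not part of this diff; presumably it appends a ninja build statement linking outputs to inputs through a rule, roughly like this hypothetical sketch (assuming the dumps follow the RC_YYYY-MM.bz2 naming of the public Reddit comment corpus):

def add_dep(lines, rule, input, output):
    # Hypothetical sketch of the helper used above: emit a ninja
    # 'build' statement, e.g.
    #   build .../reddit_en.RC_2007-10.txt: extract_reddit .../RC_2007-10.bz2
    if isinstance(input, list):
        input = ' '.join(input)
    lines.append('build {}: {} {}'.format(output, rule, input))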
@@ -33,7 +33,7 @@ def count_tokens(filename):
     return counts
 
 
-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file. Returns
     a dictionary of values and the total of all values.
@@ -52,7 +52,7 @@ def read_values(filename, cutoff=0, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
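Note that the new condition still uses break rather than continue: read_values can stop at the first entry below the cutoff, or once max_size entries have been collected, only because the input CSV is expected to be sorted by value in descending order, so every remaining row is guaranteed to be smaller.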
@@ -76,7 +76,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific preprocessing
     operations.
     """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
     for word in values:
         values[word] /= total
 
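The change from read_values(filename, cutoff, lang) to read_values(filename, cutoff, lang=lang) is required, not cosmetic: with the new max_size parameter inserted before lang, the old positional call would silently bind the language code to max_size. A minimal, self-contained illustration of the pitfall:

def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    # Stub with the new signature, just to show the bindings.
    return max_size, lang

# Old positional call: 'en' lands in max_size and lang stays None,
# so language-specific tokenization would be silently skipped.
assert read_values('f.csv', 0, 'en') == ('en', None)

# Fixed call: lang is passed by keyword and binds correctly.
assert read_values('f.csv', 0, lang='en') == (1e8, 'en')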