add word frequencies from the Reddit 2007-2015 corpus

Former-commit-id: b2d7546d2d
Robyn Speer 2015-11-30 16:38:11 -05:00
parent eb08c0a951
commit 6d2709f064
6 changed files with 36 additions and 6 deletions

BIN  wordfreq_builder/lib/jq-linux64  (new executable file; binary content not shown)

wordfreq_builder/rules.ninja

@@ -13,7 +13,7 @@
# contains the programmatically-defined dependency graph.
# Variables
DATA = ./data
JQ = lib/jq-linux64
# How to build the build.ninja file itself. (Use the Makefile to get it the
# first time.)
@@ -99,3 +99,6 @@ rule freqs2cB
rule cat
    command = cat $in > $out

rule extract_reddit
    command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
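
For readers who don't speak jq and sed, here is a rough Python sketch of what the extract_reddit pipeline does, assuming one JSON comment object per line as in the Reddit dump files; the function name below is illustrative and not part of the build.

# Rough Python equivalent of the extract_reddit pipeline (illustrative only).
import bz2
import json

def extract_reddit_bodies(path_in, path_out):
    with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
         open(path_out, 'w', encoding='utf-8') as outfile:
        for line in infile:
            body = json.loads(line)['body']
            if '[deleted]' in body:
                continue  # mirrors `fgrep -v '[deleted]'`
            # Undo the HTML escaping, in the same order as the sed commands.
            body = body.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
            outfile.write(body + '\n')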

wordfreq_builder/wordfreq_builder/cli/merge_counts.py

@@ -5,7 +5,7 @@ import argparse
def merge_lists(input_names, output_name):
    count_dicts = []
    for input_name in input_names:
        values, total = read_values(input_name, cutoff=0)
        values, total = read_values(input_name, cutoff=0, max_size=1000000)
        count_dicts.append(values)
    merged = merge_counts(count_dicts)
    write_wordlist(merged, output_name)
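
merge_counts itself lives in word_counts.py and is unchanged by this commit; a minimal sketch of the merging step it is assumed to perform, summing each word's count across the per-file dictionaries:

from collections import Counter

def merge_counts_sketch(count_dicts):
    # Assumed behavior: add up each word's count across all input dictionaries.
    merged = Counter()
    for counts in count_dicts:
        merged.update(counts)
    return dict(merged)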

wordfreq_builder/wordfreq_builder/config.py

@@ -40,7 +40,8 @@ CONFIG = {
        ],
        'subtlex-en': ['en'],
        'subtlex-other': ['de', 'nl', 'zh'],
        'jieba': ['zh']
        'jieba': ['zh'],
        'reddit': ['en'],
    },
    # Subtlex languages that need to be pre-processed
    'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
        'jieba': 'generated/jieba/jieba_{lang}.{ext}',
        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}',
        'combined-dist': 'dist/combined_{lang}.{ext}',
        'twitter-dist': 'dist/twitter_{lang}.{ext}',
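
As an illustration of how the new template is filled in (the real lookup goes through the path helpers in config.py, which this diff doesn't show), the merged English Reddit counts file would resolve roughly like this:

# Hypothetical illustration of expanding the 'reddit' path template.
template = 'generated/reddit/reddit_{lang}.{ext}'
print(template.format(lang='en', ext='counts.txt'))
# -> generated/reddit/reddit_en.counts.txt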

wordfreq_builder/wordfreq_builder/ninja.py

@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
            data_filename('source-lists/subtlex'),
            CONFIG['sources']['subtlex-other']
        ),
        reddit_deps(
            data_filename('raw-input/reddit'),
            CONFIG['sources']['reddit']
        ),
        jieba_deps(
            data_filename('source-lists/jieba'),
            CONFIG['sources']['jieba']
@@ -232,6 +236,27 @@ def jieba_deps(dirname_in, languages):
    return lines


def reddit_deps(dirname_in, languages):
    lines = []
    if not languages:
        return lines
    assert languages == ['en']

    processed_files = []
    path_in = pathlib.Path(dirname_in)
    for filepath in path_in.glob('*/*.bz2'):
        base = filepath.name[:-4]
        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
        add_dep(lines, 'count', transformed_file, count_file)
        processed_files.append(count_file)

    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
    add_dep(lines, 'merge_counts', processed_files, output_file)
    return lines

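
To make the naming concrete, here is a minimal sketch of what one iteration of that loop computes for a hypothetical dump file; the concrete path below is illustrative, and add_dep and wordlist_filename are the repo's existing helpers, not shown in this diff.

import pathlib

# Illustrative only: one hypothetical Reddit dump file.
filepath = pathlib.Path('data/raw-input/reddit/2007/RC_2007-10.bz2')
base = filepath.name[:-4]   # strip the '.bz2' suffix
print(base)                 # -> RC_2007-10
# reddit_deps then registers three build steps per file: extract_reddit on the
# .bz2 dump, count on the extracted text, and one final merge_counts over all
# of the per-file count files.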
# Which columns of the SUBTLEX data files do the word and its frequency appear
# in?
SUBTLEX_COLUMN_MAP = {

wordfreq_builder/wordfreq_builder/word_counts.py

@@ -33,7 +33,7 @@ def count_tokens(filename):
    return counts


def read_values(filename, cutoff=0, lang=None):
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.
@@ -52,7 +52,7 @@ def read_values(filename, cutoff=0, lang=None):
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff:
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
@@ -76,7 +76,7 @@ def read_freqs(filename, cutoff=0, lang=None):
    If lang is given, read_freqs will apply language specific preprocessing
    operations.
    """
    values, total = read_values(filename, cutoff, lang)
    values, total = read_values(filename, cutoff, lang=lang)
    for word in values:
        values[word] /= total
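
One note on the read_freqs change: because the new max_size parameter now sits between cutoff and lang in read_values' signature, the old positional call would have bound the language code to max_size, so the call site now passes lang by keyword. A self-contained sketch of the pitfall (the stub below is illustrative, not the real read_values):

def read_values_stub(filename, cutoff=0, max_size=1e8, lang=None):
    return max_size, lang

# Old-style positional call: 'en' lands in max_size instead of lang.
print(read_values_stub('counts.csv', 0, 'en'))       # ('en', None)
# Keyword call, as in the updated read_freqs:
print(read_values_stub('counts.csv', 0, lang='en'))  # (100000000.0, 'en')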