add word frequencies from the Reddit 2007-2015 corpus
Former-commit-id: b2d7546d2d
parent eb08c0a951
commit 6d2709f064
BIN  wordfreq_builder/lib/jq-linux64  Executable file (binary file not shown)
@@ -13,7 +13,7 @@
 # contains the programatically-defined dependency graph.
 
 # Variables
 DATA = ./data
+JQ = lib/jq-linux64
 
 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@@ -99,3 +99,6 @@ rule freqs2cB
 
 rule cat
   command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
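The new extract_reddit rule takes one bzip2-compressed monthly dump of Reddit comments, pulls each comment's body field out of the JSON with jq, drops '[deleted]' comments, and undoes Reddit's HTML escaping of >, < and &. A rough standalone Python equivalent, for illustration only (bz2 and json stand in for bunzip2 and jq, and the function name is made up):

    import bz2
    import json

    def extract_reddit_bodies(path_in, path_out):
        # Each line of a Reddit dump is one JSON object describing a comment.
        with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
                open(path_out, 'w', encoding='utf-8') as outfile:
            for line in infile:
                body = json.loads(line).get('body', '')
                if '[deleted]' in body:
                    continue
                # Reddit HTML-escapes these characters in comment bodies;
                # unescape them in the same order as the sed pipeline above.
                body = body.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
                outfile.write(body + '\n')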
@@ -5,7 +5,7 @@ import argparse
 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=0, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
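The max_size=1000000 cap means each intermediate count file contributes at most its first million entries (the most frequent ones, since the count files are written in descending order) when the per-month lists are merged, keeping memory bounded as the number of monthly dumps grows. A hypothetical invocation of the function above, with made-up filenames (in the real build the ninja graph supplies them):

    merge_lists(
        ['reddit_en.RC_2007-10.counts.txt', 'reddit_en.RC_2007-11.counts.txt'],
        'reddit_en.counts.txt'
    )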
@@ -40,7 +40,8 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
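The new 'reddit' entry follows the same template convention as the other wordlist paths. Assuming wordlist_filename fills the template with str.format-style substitution (it may also prefix the data directory), the Reddit intermediates land under generated/reddit/:

    # Assumption: the template is filled with str.format; paths shown are relative.
    template = 'generated/reddit/reddit_{lang}.{ext}'
    print(template.format(lang='en', ext='counts.txt'))
    # -> generated/reddit/reddit_en.counts.txt
    print(template.format(lang='en', ext='RC_2015-01.counts.txt'))
    # -> generated/reddit/reddit_en.RC_2015-01.counts.txt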
@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
            data_filename('source-lists/subtlex'),
            CONFIG['sources']['subtlex-other']
        ),
+       reddit_deps(
+           data_filename('raw-input/reddit'),
+           CONFIG['sources']['reddit']
+       ),
        jieba_deps(
            data_filename('source-lists/jieba'),
            CONFIG['sources']['jieba']
@@ -232,6 +236,27 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(lines, 'merge_counts', processed_files, output_file)
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
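For each monthly *.bz2 dump, reddit_deps chains three build steps: extract_reddit (bunzip2 + jq, defined above), count (token counting), and one final merge_counts over all the per-month count files. The exact text add_dep emits is not shown in this commit; the sketch below only illustrates the shape of the resulting dependency graph, with made-up filenames:

    # Illustrative shape of the dependency chain reddit_deps sets up; the real
    # targets come from wordlist_filename('reddit', 'en', ...).
    months = ['RC_2007-10.bz2', 'RC_2007-11.bz2']

    build_graph = {}        # target -> (rule, inputs)
    count_files = []
    for month in months:
        base = month[:-4]                  # strip '.bz2', as filepath.name[:-4] does
        extracted = base + '.txt'
        counted = base + '.counts.txt'
        build_graph[extracted] = ('extract_reddit', [month])
        build_graph[counted] = ('count', [extracted])
        count_files.append(counted)

    build_graph['counts.txt'] = ('merge_counts', count_files)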
@@ -33,7 +33,7 @@ def count_tokens(filename):
     return counts
 
 
-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file. Returns
     a dictionary of values and the total of all values.
@@ -52,7 +52,7 @@ def read_values(filename, cutoff=0, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
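The new max_size check reuses the existing early break: because the input files are read in descending order of value (which the cutoff break already assumes), stopping once the dictionary holds max_size entries keeps the most frequent entries and skips the long tail. A standalone sketch of just this truncation logic (not the library function itself, which also applies fix_text and tokenization):

    import csv

    def read_top_values(filename, cutoff=0, max_size=1000000):
        # Read (word, value) rows until the value drops below the cutoff or we
        # have already collected max_size entries; assumes the file is sorted
        # by value, descending.
        values = {}
        total = 0.0
        with open(filename, encoding='utf-8', newline='') as infile:
            for key, strval in csv.reader(infile):
                val = float(strval)
                if val < cutoff or len(values) >= max_size:
                    break
                values[key] = values.get(key, 0) + val
                total += val
        return values, total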
@@ -76,7 +76,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific preprocessing
     operations.
     """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
     for word in values:
         values[word] /= total
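This last change is forced by the signature change above: with max_size inserted before lang, the old positional call read_values(filename, cutoff, lang) would bind the language code to max_size and leave lang as None, so lang must now be passed by keyword. A self-contained toy stand-in (not the real function) that shows the misbinding:

    # Stand-in with the same parameter order as the new read_values signature,
    # used only to show how the old positional call misbinds its arguments.
    def read_values(filename, cutoff=0, max_size=1e8, lang=None):
        return {'filename': filename, 'cutoff': cutoff, 'max_size': max_size, 'lang': lang}

    print(read_values('counts.csv', 0, 'en'))
    # {'filename': 'counts.csv', 'cutoff': 0, 'max_size': 'en', 'lang': None}   <- 'en' became max_size
    print(read_values('counts.csv', 0, lang='en'))
    # {'filename': 'counts.csv', 'cutoff': 0, 'max_size': 100000000.0, 'lang': 'en'}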