commit 6d2709f064 (parent eb08c0a951)

    add word frequencies from the Reddit 2007-2015 corpus

    Former-commit-id: b2d7546d2d
wordfreq_builder/lib/jq-linux64: new executable file (binary file not shown)
wordfreq_builder/rules.ninja
@@ -13,7 +13,7 @@
 # contains the programatically-defined dependency graph.
 
 # Variables
 DATA = ./data
+JQ = lib/jq-linux64
 
 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@@ -99,3 +99,6 @@ rule freqs2cB
 
 rule cat
   command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
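For readers who don't use jq and sed, here is a rough Python equivalent of what extract_reddit does to one month of comments; the function name is made up for illustration. Each line of the .bz2 dump is one JSON object per comment; the rule extracts its body field, drops lines containing the literal marker '[deleted]', and undoes the HTML escaping of >, < and & that the Reddit dumps use.

import bz2
import json

def extract_reddit_bodies(path_in, path_out):
    # Rough Python equivalent of the extract_reddit rule (illustrative only).
    with bz2.open(path_in, 'rt', encoding='utf-8') as infile, \
            open(path_out, 'w', encoding='utf-8') as outfile:
        for line in infile:
            body = json.loads(line)['body']    # jq -r '.body'
            if '[deleted]' in body:            # fgrep -v '[deleted]'
                continue
            # the three sed substitutions: unescape &gt;, &lt;, &amp;
            body = body.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&')
            print(body, file=outfile)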
wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -5,7 +5,7 @@ import argparse
 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=0, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
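read_values and merge_counts come from wordfreq_builder's word_counts module. The new max_size=1000000 argument caps each input at its million most frequent entries, which bounds the memory this merge step needs. As a sketch of the merge behavior being relied on here, assuming merge_counts simply sums per-word counts:

from collections import Counter

def merge_counts(count_dicts):
    # Sketch: sum each word's count across all the input dictionaries.
    merged = Counter()
    for counts in count_dicts:
        merged.update(counts)
    return merged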
wordfreq_builder/wordfreq_builder/config.py
@@ -40,7 +40,8 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
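The {lang} and {ext} placeholders are filled in by the path helpers in config.py. A minimal sketch of that lookup, assuming the helper does nothing more than format the template:

def wordlist_filename(source, language, extension='txt'):
    # Sketch: resolve a path template from CONFIG['wordlist_paths'].
    return CONFIG['wordlist_paths'][source].format(lang=language, ext=extension)

# wordlist_filename('reddit', 'en', 'counts.txt')
# -> 'generated/reddit/reddit_en.counts.txt'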
wordfreq_builder/wordfreq_builder/ninja.py
@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-other']
         ),
+        reddit_deps(
+            data_filename('raw-input/reddit'),
+            CONFIG['sources']['reddit']
+        ),
         jieba_deps(
             data_filename('source-lists/jieba'),
             CONFIG['sources']['jieba']
@@ -232,6 +236,27 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(lines, 'merge_counts', processed_files, output_file)
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
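reddit_deps turns each monthly dump it finds under raw-input/reddit into three build steps: extract the comment bodies, count tokens, and merge all the monthly counts into one list. add_dep is ninja.py's existing helper; here is a sketch of its likely core, assuming it just emits one ninja build statement per call (the real helper may take further optional arguments). The RC_2015-08 name in the usage comment is illustrative, following the naming convention of the public Reddit dumps.

def add_dep(lines, rule, input, output):
    # Sketch: wire inputs to outputs with one ninja build statement.
    if isinstance(input, list):
        input = ' '.join(input)
    lines.append('build {}: {} {}'.format(output, rule, input))

# add_dep(lines, 'count', 'reddit_en.RC_2015-08.txt', 'reddit_en.RC_2015-08.counts.txt')
# appends: 'build reddit_en.RC_2015-08.counts.txt: count reddit_en.RC_2015-08.txt'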
wordfreq_builder/wordfreq_builder/word_counts.py
@@ -33,7 +33,7 @@ def count_tokens(filename):
     return counts
 
 
-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file. Returns
     a dictionary of values and the total of all values.
@@ -52,7 +52,7 @@ def read_values(filename, cutoff=0, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
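The input CSVs read here are expected to be sorted by descending value, which is why read_values can break at the first entry below cutoff rather than keep scanning; the new max_size check also stops it once the result holds that many distinct entries. A toy illustration of the combined stopping rule:

import csv, io

rows = "the,100\r\nof,50\r\nand,40\r\ncat,2\r\n"   # sorted by descending count
values = {}
for key, strval in csv.reader(io.StringIO(rows)):
    if float(strval) < 10 or len(values) >= 2:     # cutoff=10, max_size=2
        break
    values[key] = float(strval)
print(values)  # {'the': 100.0, 'of': 50.0}: 'and' clears the cutoff but not max_size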
@@ -76,7 +76,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific preprocessing
     operations.
     """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
     for word in values:
         values[word] /= total
 
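This call-site change in read_freqs is forced by the signature change above: max_size now sits between cutoff and lang, so the old positional call would bind the language code to max_size and fail at the first comparison. Passing lang by keyword keeps the binding right:

# old signature: read_values(filename, cutoff=0, lang=None)
# new signature: read_values(filename, cutoff=0, max_size=1e8, lang=None)

read_values(filename, cutoff, 'en')       # was lang; would now be max_size
read_values(filename, cutoff, lang='en')  # unambiguous under either signature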