Merge pull request #30 from LuminosoInsight/add-reddit

Add English data from Reddit corpus

Former-commit-id: d18fee3d78
This commit is contained in:
slibs63 2016-01-14 15:52:39 -05:00
commit 927d4f45a4
46 changed files with 121 additions and 32 deletions

View File

@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.2',
version='1.3',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
wordfreq_builder/lib/jq-linux64 Executable file

Binary file not shown.

View File

@ -13,7 +13,7 @@
# contains the programmatically-defined dependency graph.
# Variables
DATA = ./data
JQ = lib/jq-linux64
# How to build the build.ninja file itself. (Use the Makefile to get it the
# first time.)
@ -92,10 +92,13 @@ rule merge
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
rule merge_counts
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
rule freqs2cB
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
rule cat
command = cat $in > $out
# Extract comment text from a bzip2-compressed Reddit dump: decompress it,
# print the 'body' field of each JSON record with jq, drop lines containing
# '[deleted]', unescape the &gt;, &lt; and &amp; HTML entities, and gzip the
# result into $out.
rule extract_reddit
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out

View File

@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
import argparse
def merge_lists(input_names, output_name):
def merge_lists(input_names, output_name, cutoff=0):
count_dicts = []
for input_name in input_names:
values, total = read_values(input_name, cutoff=0)
values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
count_dicts.append(values)
merged = merge_counts(count_dicts)
write_wordlist(merged, output_name)
@ -13,8 +13,12 @@ def merge_lists(input_names, output_name):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
parser.add_argument('-o', '--output', default='combined-counts.csv',
help='filename to write the output to')
# Minimum count threshold; handed to merge_lists() as `cutoff`.
# (The comma after `default=0` is required -- without it this call is a
# SyntaxError and the script cannot run.)
parser.add_argument('-c', '--cutoff', type=int, default=0,
                    help='minimum count to read from an input file')
parser.add_argument('inputs', nargs='+',
help='names of input files to merge')
args = parser.parse_args()
merge_lists(args.inputs, args.output)
merge_lists(args.inputs, args.output, cutoff=args.cutoff)

View File

@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
parser.add_argument('-o', '--output', default='combined-freqs.csv',
help='filename to write the output to')
parser.add_argument('-c', '--cutoff', type=int, default=2,
help='stop after seeing a count below this')
parser.add_argument('-l', '--language', default=None,
help='language code for which language the words are in')
parser.add_argument('inputs', nargs='+',
help='names of input files to merge')
args = parser.parse_args()
merge_lists(args.inputs, args.output, args.cutoff, args.language)

View File

@ -0,0 +1,14 @@
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
import argparse
def main():
    """
    Command-line entry point: read a file with one Reddit comment per line
    and tokenize it by language using the Reddit-specific tokenizer.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'filename',
        help='filename of input file containing one comment per line'
    )
    arg_parser.add_argument(
        'outprefix',
        help='prefix of output filenames'
    )
    options = arg_parser.parse_args()
    tokenize_by_language(
        options.filename,
        options.outprefix,
        tokenizer=cld2_reddit_tokenizer
    )
if __name__ == '__main__':
main()

View File

@ -1,4 +1,4 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
import argparse
@ -7,7 +7,7 @@ def main():
parser.add_argument('filename', help='filename of input file containing one tweet per line')
parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args()
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
if __name__ == '__main__':

View File

@ -40,7 +40,8 @@ CONFIG = {
],
'subtlex-en': ['en'],
'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh']
'jieba': ['zh'],
'reddit': ['en'],
},
# Subtlex languages that need to be pre-processed
'wordlist_paths': {
@ -52,6 +53,7 @@ CONFIG = {
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
'combined': 'generated/combined/combined_{lang}.{ext}',
'combined-dist': 'dist/combined_{lang}.{ext}',
'twitter-dist': 'dist/twitter_{lang}.{ext}',

View File

@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-other']
),
reddit_deps(
data_filename('raw-input/reddit'),
CONFIG['sources']['reddit']
),
jieba_deps(
data_filename('source-lists/jieba'),
CONFIG['sources']['jieba']
@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
return lines
def reddit_deps(dirname_in, languages):
    """
    Generate the ninja dependency lines for the Reddit corpus.

    Every `*.bz2` file found one directory level below `dirname_in` gets an
    'extract_reddit' step producing a `.txt.gz` file, followed by a 'count'
    step producing a `.counts.txt` file. All the per-file counts are then
    combined by a single 'merge_counts' step with a cutoff of 3.

    Returns the list of generated lines, which is empty when `languages` is
    empty. Only English is supported: any other non-empty value of
    `languages` fails the assertion.
    """
    lines = []
    if languages:
        assert languages == ['en']

        per_file_counts = []
        for dump_path in pathlib.Path(dirname_in).glob('*/*.bz2'):
            # Drop the '.bz2' suffix to get the base name of the outputs.
            stem = dump_path.name[:-len('.bz2')]

            text_file = wordlist_filename('reddit', 'en', stem + '.txt.gz')
            add_dep(lines, 'extract_reddit', str(dump_path), text_file)

            counts_file = wordlist_filename(
                'reddit', 'en', stem + '.counts.txt'
            )
            add_dep(lines, 'count', text_file, counts_file)
            per_file_counts.append(counts_file)

        merged_file = wordlist_filename('reddit', 'en', 'counts.txt')
        add_dep(
            lines, 'merge_counts', per_file_counts, merged_file,
            params={'cutoff': 3}
        )
    return lines
# Which columns of the SUBTLEX data files do the word and its frequency appear
# in?
SUBTLEX_COLUMN_MAP = {
@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
)
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
add_dep(lines, 'merge_counts', processed_files, output_file)
add_dep(
lines, 'merge_counts', processed_files, output_file,
params={'cutoff': 0}
)
return lines
@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
)
add_dep(
lines, 'merge_counts', processed_file, output_file
lines, 'merge_counts', processed_file, output_file,
params={'cutoff': 0}
)
return lines

View File

@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
def cld2_surface_tokenizer(text):
@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
lang = cld2_detect_language(text)
# Don't allow tokenization in Chinese when language-detecting, because
@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
return lang, tokens
# Low-frequency languages tend to be detected incorrectly. Keep a limited
# list of languages we're allowed to use here.
KEEP_THESE_LANGUAGES = {
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
}
def cld2_reddit_tokenizer(text):
    """
    Detect the language of a Reddit comment and tokenize it.

    URLs are removed first, and the `]()` residue that a removed URL leaves
    behind in a Markdown link is reduced to `]`. Returns a
    `(language, tokens)` pair; the tokens include punctuation.
    """
    without_urls = URL_RE.sub('', text)
    cleaned = MARKDOWN_URL_RESIDUE_RE.sub(']', without_urls)

    detected = cld2_detect_language(cleaned)
    # Reddit is 99.9% English, so a detected language outside the allowed
    # list is much more likely to be misdetected English than the real thing.
    lang = detected if detected in KEEP_THESE_LANGUAGES else 'en'

    return lang, tokenize(cleaned, lang, include_punctuation=True)
def cld2_detect_language(text):
"""
Uses CLD2 to detect the language.
@ -59,13 +88,11 @@ def cld2_detect_language(text):
return pycld2.detect(text)[2][0][1]
def tokenize_twitter(in_filename, out_prefix, tokenizer):
def tokenize_by_language(in_filename, out_prefix, tokenizer):
"""
Process a file by running it through the Twitter-specific tokenizer,
which uses cld2 for language detection, and removes Twitter handles
and t.co URLs.
Process a file by running it through a given tokenizer.
Produces output files that are separated by language, with newlines
Produces output files that are separated by language, with spaces
between the tokens.
"""
out_files = {}
@ -74,7 +101,7 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text)
if language != 'un':
tokenized = '\n'.join(tokens)
tokenized = ' '.join(tokens)
out_filename = '%s.%s.txt' % (out_prefix, language)
if out_filename in out_files:
out_file = out_files[out_filename]

View File

@ -24,16 +24,19 @@ def count_tokens(filename):
containing '<EFBFBD>'.
"""
counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile:
if filename.endswith('gz'):
infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
else:
infile = open(filename, encoding='utf-8', errors='replace')
for line in infile:
line = URL_RE.sub('', line.strip())
for token in simple_tokenize(line):
counts[token] += 1
infile.close()
return counts
def read_values(filename, cutoff=0, lang=None):
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
"""
Read words and their frequency or count values from a CSV file. Returns
a dictionary of values and the total of all values.
@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
for key, strval in csv.reader(infile):
val = float(strval)
key = fix_text(key)
if val < cutoff:
if val < cutoff or len(values) >= max_size:
break
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens:
@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
If lang is given, read_freqs will apply language specific preprocessing
operations.
"""
values, total = read_values(filename, cutoff, lang)
values, total = read_values(filename, cutoff, lang=lang)
for word in values:
values[word] /= total