Merge pull request #30 from LuminosoInsight/add-reddit

Add English data from Reddit corpus

Former-commit-id: d18fee3d78
This commit is contained in:
slibs63 2016-01-14 15:52:39 -05:00
commit 927d4f45a4
46 changed files with 121 additions and 32 deletions

View File

@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
setup( setup(
name="wordfreq", name="wordfreq",
version='1.2', version='1.3',
maintainer='Luminoso Technologies, Inc.', maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com', maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
wordfreq_builder/lib/jq-linux64 Executable file

Binary file not shown.

View File

@ -13,7 +13,7 @@
# contains the programmatically-defined dependency graph. # contains the programmatically-defined dependency graph.
# Variables # Variables
DATA = ./data JQ = lib/jq-linux64
# How to build the build.ninja file itself. (Use the Makefile to get it the # How to build the build.ninja file itself. (Use the Makefile to get it the
# first time.) # first time.)
@ -92,10 +92,13 @@ rule merge
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
rule merge_counts rule merge_counts
command = python -m wordfreq_builder.cli.merge_counts -o $out $in command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
rule freqs2cB rule freqs2cB
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
rule cat rule cat
command = cat $in > $out command = cat $in > $out
rule extract_reddit
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out

View File

@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
import argparse import argparse
def merge_lists(input_names, output_name): def merge_lists(input_names, output_name, cutoff=0):
count_dicts = [] count_dicts = []
for input_name in input_names: for input_name in input_names:
values, total = read_values(input_name, cutoff=0) values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
count_dicts.append(values) count_dicts.append(values)
merged = merge_counts(count_dicts) merged = merge_counts(count_dicts)
write_wordlist(merged, output_name) write_wordlist(merged, output_name)
@ -13,8 +13,12 @@ def merge_lists(input_names, output_name):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') parser.add_argument('-o', '--output', default='combined-counts.csv',
parser.add_argument('inputs', help='names of input files to merge', nargs='+') help='filename to write the output to')
# The cutoff is handed to merge_lists(), which forwards it to read_values();
# read_values() stops reading an input file once a count falls below it.
parser.add_argument('-c', '--cutoff', type=int, default=0,
                    help='minimum count to read from an input file')
parser.add_argument('inputs', nargs='+',
help='names of input files to merge')
args = parser.parse_args() args = parser.parse_args()
merge_lists(args.inputs, args.output) merge_lists(args.inputs, args.output, cutoff=args.cutoff)

View File

@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv') parser.add_argument('-o', '--output', default='combined-freqs.csv',
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2) help='filename to write the output to')
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None) parser.add_argument('-c', '--cutoff', type=int, default=2,
parser.add_argument('inputs', help='names of input files to merge', nargs='+') help='stop after seeing a count below this')
parser.add_argument('-l', '--language', default=None,
help='language code for which language the words are in')
parser.add_argument('inputs', nargs='+',
help='names of input files to merge')
args = parser.parse_args() args = parser.parse_args()
merge_lists(args.inputs, args.output, args.cutoff, args.language) merge_lists(args.inputs, args.output, args.cutoff, args.language)

View File

@ -0,0 +1,14 @@
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
import argparse
def main():
    """
    Command-line entry point for tokenizing a file of Reddit comments.

    Takes two positional arguments, the input filename (one comment per
    line) and a prefix for the output filenames, and passes them to
    `tokenize_by_language` together with the Reddit-specific tokenizer.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'filename',
        help='filename of input file containing one comment per line'
    )
    arg_parser.add_argument(
        'outprefix',
        help='prefix of output filenames'
    )
    options = arg_parser.parse_args()
    tokenize_by_language(
        options.filename,
        options.outprefix,
        tokenizer=cld2_reddit_tokenizer
    )


if __name__ == '__main__':
    main()

View File

@ -1,4 +1,4 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
import argparse import argparse
@ -7,7 +7,7 @@ def main():
parser.add_argument('filename', help='filename of input file containing one tweet per line') parser.add_argument('filename', help='filename of input file containing one tweet per line')
parser.add_argument('outprefix', help='prefix of output filenames') parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args() args = parser.parse_args()
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer) tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -40,7 +40,8 @@ CONFIG = {
], ],
'subtlex-en': ['en'], 'subtlex-en': ['en'],
'subtlex-other': ['de', 'nl', 'zh'], 'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh'] 'jieba': ['zh'],
'reddit': ['en'],
}, },
# Subtlex languages that need to be pre-processed # Subtlex languages that need to be pre-processed
'wordlist_paths': { 'wordlist_paths': {
@ -52,6 +53,7 @@ CONFIG = {
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}', 'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}', 'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
'jieba': 'generated/jieba/jieba_{lang}.{ext}', 'jieba': 'generated/jieba/jieba_{lang}.{ext}',
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
'combined': 'generated/combined/combined_{lang}.{ext}', 'combined': 'generated/combined/combined_{lang}.{ext}',
'combined-dist': 'dist/combined_{lang}.{ext}', 'combined-dist': 'dist/combined_{lang}.{ext}',
'twitter-dist': 'dist/twitter_{lang}.{ext}', 'twitter-dist': 'dist/twitter_{lang}.{ext}',

View File

@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
data_filename('source-lists/subtlex'), data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-other'] CONFIG['sources']['subtlex-other']
), ),
reddit_deps(
data_filename('raw-input/reddit'),
CONFIG['sources']['reddit']
),
jieba_deps( jieba_deps(
data_filename('source-lists/jieba'), data_filename('source-lists/jieba'),
CONFIG['sources']['jieba'] CONFIG['sources']['jieba']
@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
return lines return lines
def reddit_deps(dirname_in, languages):
    """
    Generate the build dependencies that turn raw Reddit dumps into English
    word counts.

    Every file matching `*/*.bz2` under `dirname_in` gets an `extract_reddit`
    step that produces a `.txt.gz` file, followed by a `count` step that
    produces a `.counts.txt` file. The per-file counts are then combined by a
    single `merge_counts` step with a cutoff of 3.

    Returns the `lines` list that is passed to every `add_dep` call
    (presumably `add_dep` appends the build statements to it). The list is
    empty when `languages` is empty.
    """
    lines = []
    if not languages:
        return lines
    # Reddit data is only built for English; any other configuration is a
    # mistake in CONFIG['sources'].
    assert languages == ['en']

    processed_files = []
    path_in = pathlib.Path(dirname_in)
    # Path.glob() yields entries in arbitrary order. Sorting keeps the
    # generated build rules, and the order of inputs to merge_counts,
    # identical from one run to the next.
    for filepath in sorted(path_in.glob('*/*.bz2')):
        # Drop the '.bz2' suffix that the glob pattern guarantees.
        base = filepath.name[:-4]
        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
        add_dep(lines, 'count', transformed_file, count_file)
        processed_files.append(count_file)

    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
    add_dep(
        lines, 'merge_counts', processed_files, output_file,
        params={'cutoff': 3}
    )
    return lines
# Which columns of the SUBTLEX data files do the word and its frequency appear # Which columns of the SUBTLEX data files do the word and its frequency appear
# in? # in?
SUBTLEX_COLUMN_MAP = { SUBTLEX_COLUMN_MAP = {
@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
) )
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt') output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
add_dep(lines, 'merge_counts', processed_files, output_file) add_dep(
lines, 'merge_counts', processed_files, output_file,
params={'cutoff': 0}
)
return lines return lines
@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
) )
add_dep( add_dep(
lines, 'merge_counts', processed_file, output_file lines, 'merge_counts', processed_file, output_file,
params={'cutoff': 0}
) )
return lines return lines

View File

@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+') TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+') TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
def cld2_surface_tokenizer(text): def cld2_surface_tokenizer(text):
@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
text = unescape_html(text) text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text) text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text) text = TCO_RE.sub('', text)
lang = cld2_detect_language(text) lang = cld2_detect_language(text)
# Don't allow tokenization in Chinese when language-detecting, because # Don't allow tokenization in Chinese when language-detecting, because
@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
return lang, tokens return lang, tokens
# Low-frequency languages tend to be detected incorrectly. Keep a limited
# list of languages we're allowed to use here.
KEEP_THESE_LANGUAGES = {
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
}
def cld2_reddit_tokenizer(text):
    """
    Detect the language of a Reddit comment and tokenize it.

    Matches of URL_RE are removed first, and then every match of
    MARKDOWN_URL_RESIDUE_RE (the `]()` left where a Markdown link target
    was removed) is replaced with `]`.

    Returns a `(lang, tokens)` pair, where `tokens` includes punctuation.
    """
    cleaned = MARKDOWN_URL_RESIDUE_RE.sub(']', URL_RE.sub('', text))
    detected = cld2_detect_language(cleaned)

    # Reddit is 99.9% English, so a detected language outside the allowed
    # set is much more likely to be misdetected English.
    lang = detected if detected in KEEP_THESE_LANGUAGES else 'en'

    return lang, tokenize(cleaned, lang, include_punctuation=True)
def cld2_detect_language(text): def cld2_detect_language(text):
""" """
Uses CLD2 to detect the language. Uses CLD2 to detect the language.
@ -59,13 +88,11 @@ def cld2_detect_language(text):
return pycld2.detect(text)[2][0][1] return pycld2.detect(text)[2][0][1]
def tokenize_twitter(in_filename, out_prefix, tokenizer): def tokenize_by_language(in_filename, out_prefix, tokenizer):
""" """
Process a file by running it through the Twitter-specific tokenizer, Process a file by running it through a given tokenizer.
which uses cld2 for language detection, and removes Twitter handles
and t.co URLs.
Produces output files that are separated by language, with newlines Produces output files that are separated by language, with spaces
between the tokens. between the tokens.
""" """
out_files = {} out_files = {}
@ -74,7 +101,7 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
text = line.split('\t')[-1].strip() text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text) language, tokens = tokenizer(text)
if language != 'un': if language != 'un':
tokenized = '\n'.join(tokens) tokenized = ' '.join(tokens)
out_filename = '%s.%s.txt' % (out_prefix, language) out_filename = '%s.%s.txt' % (out_prefix, language)
if out_filename in out_files: if out_filename in out_files:
out_file = out_files[out_filename] out_file = out_files[out_filename]

View File

@ -24,16 +24,19 @@ def count_tokens(filename):
containing '<EFBFBD>'. containing '<EFBFBD>'.
""" """
counts = defaultdict(int) counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile: if filename.endswith('gz'):
for line in infile: infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
line = URL_RE.sub('', line.strip()) else:
for token in simple_tokenize(line): infile = open(filename, encoding='utf-8', errors='replace')
counts[token] += 1 for line in infile:
line = URL_RE.sub('', line.strip())
for token in simple_tokenize(line):
counts[token] += 1
infile.close()
return counts return counts
def read_values(filename, cutoff=0, lang=None): def read_values(filename, cutoff=0, max_size=1e8, lang=None):
""" """
Read words and their frequency or count values from a CSV file. Returns Read words and their frequency or count values from a CSV file. Returns
a dictionary of values and the total of all values. a dictionary of values and the total of all values.
@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
for key, strval in csv.reader(infile): for key, strval in csv.reader(infile):
val = float(strval) val = float(strval)
key = fix_text(key) key = fix_text(key)
if val < cutoff: if val < cutoff or len(values) >= max_size:
break break
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens: for token in tokens:
@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
If lang is given, read_freqs will apply language specific preprocessing If lang is given, read_freqs will apply language specific preprocessing
operations. operations.
""" """
values, total = read_values(filename, cutoff, lang) values, total = read_values(filename, cutoff, lang=lang)
for word in values: for word in values:
values[word] /= total values[word] /= total