actually use the results of language-detection on Reddit

This commit is contained in:
Rob Speer 2016-03-24 16:27:24 -04:00
parent 164a5b1a05
commit 75a4a92110
6 changed files with 113 additions and 58 deletions

View File

@ -46,6 +46,9 @@ rule simplify_chinese
rule tokenize_twitter
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
rule tokenize_reddit
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
@ -101,4 +104,4 @@ rule cat
command = cat $in > $out
rule extract_reddit
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out

View File

@ -2,12 +2,12 @@ from setuptools import setup
setup(
name="wordfreq_builder",
version='0.1',
version='0.2',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq_builder',
platforms=["any"],
description="Turns raw data into word frequency lists",
packages=['wordfreq_builder'],
install_requires=['msgpack-python', 'pycld2']
install_requires=['msgpack-python', 'pycld2', 'langcodes']
)

View File

@ -1,13 +1,17 @@
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
import argparse
def reddit_tokenizer(text):
    """
    Tokenize one Reddit comment with the language-detecting tokenizer.

    This is `cld2_surface_tokenizer` pinned to its 'reddit' mode, so that it
    can be handed to `tokenize_by_language` as a one-argument callable. The
    result of `cld2_surface_tokenizer` is returned unchanged.
    """
    detected = cld2_surface_tokenizer(text, mode='reddit')
    return detected
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filename', help='filename of input file containing one comment per line')
parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args()
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)
if __name__ == '__main__':

View File

@ -41,7 +41,10 @@ CONFIG = {
'subtlex-en': ['en'],
'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh'],
'reddit': ['en'],
'reddit': [
'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
'ru', 'sv'
]
},
# Subtlex languages that need to be pre-processed
'wordlist_paths': {

View File

@ -4,6 +4,8 @@ from wordfreq_builder.config import (
import sys
import pathlib
import itertools
from collections import defaultdict
HEADER = """# This file is automatically generated. Do not edit it.
# You can change its behavior by editing wordfreq_builder/ninja.py,
@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
for language in languages:
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
language_inputs = [
'{prefix}.{lang}.txt'.format(
prefix=slice_files[slicenum], lang=language
)
for slicenum in range(slices)
]
add_dep(lines, 'cat', language_inputs, combined_output)
count_file = wordlist_filename('twitter', language, 'counts.txt')
@ -236,27 +236,51 @@ def jieba_deps(dirname_in, languages):
return lines
def reddit_base_filename(path):
    """
    Get the base name of a Reddit input file, without its path or extension.

    `path` is a `pathlib` path object. Only the final extension is removed,
    so 'dumps/RC_2015-01.bz2' gives 'RC_2015-01'.
    """
    # Use `stem` rather than slicing off a fixed four characters: the result
    # is the same for '.bz2' files, but stays correct if the extension has a
    # different length.
    return path.stem
def reddit_deps(dirname_in, languages):
lines = []
if not languages:
return lines
assert languages == ['en']
processed_files = []
path_in = pathlib.Path(dirname_in)
for filepath in path_in.glob('*/*.bz2'):
base = filepath.name[:-4]
transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
add_dep(lines, 'count', transformed_file, count_file)
processed_files.append(count_file)
slices = {}
counts_by_language = defaultdict(list)
output_file = wordlist_filename('reddit', 'en', 'counts.txt')
add_dep(
lines, 'merge_counts', processed_files, output_file,
params={'cutoff': 3}
)
# Extract text from the Reddit comment dumps, and write it to
# uncompressed .txt files
for filepath in path_in.glob('*/*.bz2'):
base = reddit_base_filename(filepath)
transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
slices[base] = transformed_file
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
for base in sorted(slices):
transformed_file = slices[base]
language_outputs = []
for language in languages:
filename = wordlist_filename('reddit', base + '.' + language, '.txt')
language_outputs.append(filename)
count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
add_dep(lines, 'count', filename, count_filename)
counts_by_language[language].append(count_filename)
# find the prefix by constructing a filename, then stripping off
# '.xx.txt' from the end
prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
params={'prefix': prefix},
extra='wordfreq_builder/tokenizers.py')
for language in languages:
output_file = wordlist_filename('reddit', language, 'counts.txt')
add_dep(
lines, 'merge_counts', counts_by_language[language], output_file,
params={'cutoff': 3}
)
return lines

View File

@ -2,6 +2,7 @@ from wordfreq import tokenize
from ftfy.fixes import unescape_html
import regex
import pycld2
import langcodes
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
[
@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
def cld2_surface_tokenizer(text):
"""
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
"""
text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
# Low-frequency languages tend to be detected incorrectly by cld2. The
# languages listed below are the ones that appear in our data with
# reasonable frequency and are usually detected *correctly*. These are
# the languages we'll keep in the Reddit and Twitter results.
#
# This list is larger than the list that wordfreq ultimately generates, so we
# can look here as a source of future data.
lang = cld2_detect_language(text)
# Don't allow tokenization in Chinese when language-detecting, because
# the Chinese tokenizer may not be built yet
if lang == 'zh':
lang = 'en'
tokens = tokenize(text, lang)
return lang, tokens
# Low-frequency languages tend to be detected incorrectly. Keep a limited
# list of languages we're allowed to use here.
KEEP_THESE_LANGUAGES = {
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
'tr', 'uk', 'vi'
}
# Semi-frequent languages that are excluded by the above:
#
# - Chinese, not because it's detected incorrectly, but because we can't
# handle it until we already have word frequencies
# - Thai (seems to be detected whenever someone uses Thai characters in
# an emoticon)
# - Welsh (which is detected for "ohmygodohmygodohmygod")
# - Turkmen (detected for ASCII art)
# - Irish Gaelic (detected for Cthulhu-related text)
# - Kannada (looks of disapproval)
# - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
# - Breton (the word "memes" itself)
def cld2_reddit_tokenizer(text):
def cld2_surface_tokenizer(text, mode='twitter'):
"""
A language-detecting tokenizer with special cases for handling text from
Reddit.
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
The `mode` can be 'twitter' or 'reddit', which slightly changes the
pre-processing of the text.
"""
text = URL_RE.sub('', text)
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
text = unescape_html(text)
if mode == 'twitter':
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
elif mode == 'reddit':
text = URL_RE.sub('', text)
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
lang = cld2_detect_language(text)
if lang not in KEEP_THESE_LANGUAGES:
# Reddit is 99.9% English, so if we detected a rare language, it's
# much more likely that it's actually English.
lang = 'en'
tokens = tokenize(text, lang, include_punctuation=True)
# If the detected language isn't in our pretty generous list of languages,
# return no tokens.
if lang not in KEEP_THESE_LANGUAGES:
return 'xx', []
# cld2's accuracy seems to improve dramatically with at least 50
# bytes of input, so throw away non-English below this length.
if len(text.encode('utf-8')) < 50 and lang != 'en':
return 'xx', []
tokens = tokenize(text, lang)
return lang, tokens
@ -85,7 +101,12 @@ def cld2_detect_language(text):
# Confidence score: float))
text = CLD2_BAD_CHARS_RE.sub('', text)
return pycld2.detect(text)[2][0][1]
lang = pycld2.detect(text)[2][0][1]
# Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
# becomes 'zh'
code = langcodes.get(lang).language
return code
def tokenize_by_language(in_filename, out_prefix, tokenizer):