diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index c05841a..0263c7b 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -46,6 +46,9 @@ rule simplify_chinese
 rule tokenize_twitter
     command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
 
+rule tokenize_reddit
+    command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
+
 # To convert the Leeds corpus, look for space-separated lines that start with
 # an integer and a decimal. The integer is the rank, which we discard. The
 # decimal is the frequency, and the remaining text is the term. Use sed -n
@@ -101,4 +104,4 @@ rule cat
     command = cat $in > $out
 
 rule extract_reddit
-    command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
+    command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py
index 6f0a438..5b0f739 100755
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@@ -2,12 +2,12 @@ from setuptools import setup
 
 setup(
     name="wordfreq_builder",
-    version='0.1',
+    version='0.2',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq_builder',
     platforms=["any"],
     description="Turns raw data into word frequency lists",
     packages=['wordfreq_builder'],
-    install_requires=['msgpack-python', 'pycld2']
+    install_requires=['msgpack-python', 'pycld2', 'langcodes']
 )
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
index 6a275b3..829853c 100644
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@@ -1,13 +1,17 @@
-from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
+def reddit_tokenizer(text):
+    return cld2_surface_tokenizer(text, mode='reddit')
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('filename', help='filename of input file containing one comment per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)
 
 
 if __name__ == '__main__':
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 92708bc..1ab743f 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -41,7 +41,10 @@ CONFIG = {
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
         'jieba': ['zh'],
-        'reddit': ['en'],
+        'reddit': [
+            'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
+            'ru', 'sv'
+        ]
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 89910b2..6c72e81 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -4,6 +4,8 @@ from wordfreq_builder.config import (
 import sys
 import pathlib
 import itertools
+from collections import defaultdict
+
 
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can change its behavior by editing wordfreq_builder/ninja.py,
@@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
 
     for language in languages:
         combined_output = wordlist_filename('twitter', language, 'tokens.txt')
-
         language_inputs = [
             '{prefix}.{lang}.txt'.format(
                 prefix=slice_files[slicenum], lang=language
             )
             for slicenum in range(slices)
         ]
-
         add_dep(lines, 'cat', language_inputs, combined_output)
 
         count_file = wordlist_filename('twitter', language, 'counts.txt')
@@ -236,27 +236,51 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_base_filename(path):
+    """
+    Get the base name of a Reddit input file, without its path or extension.
+    """
+    return path.name[:-4]
+
+
 def reddit_deps(dirname_in, languages):
     lines = []
-    if not languages:
-        return lines
-    assert languages == ['en']
-
-    processed_files = []
     path_in = pathlib.Path(dirname_in)
-    for filepath in path_in.glob('*/*.bz2'):
-        base = filepath.name[:-4]
-        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
-        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
-        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
-        add_dep(lines, 'count', transformed_file, count_file)
-        processed_files.append(count_file)
+    slices = {}
+    counts_by_language = defaultdict(list)
 
-    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(
-        lines, 'merge_counts', processed_files, output_file,
-        params={'cutoff': 3}
-    )
+    # Extract text from the Reddit comment dumps, and write them to
+    # .txt.gz files
+    for filepath in path_in.glob('*/*.bz2'):
+        base = reddit_base_filename(filepath)
+        transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
+        slices[base] = transformed_file
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+
+    for base in sorted(slices):
+        transformed_file = slices[base]
+        language_outputs = []
+        for language in languages:
+            filename = wordlist_filename('reddit', base + '.' + language, '.txt')
+            language_outputs.append(filename)
+
+            count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
+            add_dep(lines, 'count', filename, count_filename)
+            counts_by_language[language].append(count_filename)
+
+        # find the prefix by constructing a filename, then stripping off
+        # '.xx.txt' from the end
+        prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
+        add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
+                params={'prefix': prefix},
+                extra='wordfreq_builder/tokenizers.py')
+
+    for language in languages:
+        output_file = wordlist_filename('reddit', language, 'counts.txt')
+        add_dep(
+            lines, 'merge_counts', counts_by_language[language], output_file,
+            params={'cutoff': 3}
+        )
     return lines
 
 
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index b47e94a..cea3283 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -2,6 +2,7 @@ from wordfreq import tokenize
 from ftfy.fixes import unescape_html
 import regex
 import pycld2
+import langcodes
 
 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
     [
@@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
 MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
-def cld2_surface_tokenizer(text):
-    """
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
-    """
-    text = unescape_html(text)
-    text = TWITTER_HANDLE_RE.sub('', text)
-    text = TCO_RE.sub('', text)
+# Low-frequency languages tend to be detected incorrectly by cld2. The
+# following list of languages are languages that appear in our data with any
+# reasonable frequency, and seem to usually be detected *correctly*. These are
+# the languages we'll keep in the Reddit and Twitter results.
+#
+# This list is larger than the list that wordfreq ultimately generates, so we
+# can look here as a source of future data.
 
-    lang = cld2_detect_language(text)
-
-    # Don't allow tokenization in Chinese when language-detecting, because
-    # the Chinese tokenizer may not be built yet
-    if lang == 'zh':
-        lang = 'en'
-
-    tokens = tokenize(text, lang)
-    return lang, tokens
-
-
-# Low-frequency languages tend to be detected incorrectly. Keep a limited
-# list of languages we're allowed to use here.
 KEEP_THESE_LANGUAGES = {
-    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
-    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+    'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
+    'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
+    'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
+    'tr', 'uk', 'vi'
 }
 
+# Semi-frequent languages that are excluded by the above:
+#
+# - Chinese, not because it's detected incorrectly, but because we can't
+#   handle it until we already have word frequencies
+# - Thai (seems to be detected whenever someone uses Thai characters in
+#   an emoticon)
+# - Welsh (which is detected for "ohmygodohmygodohmygod")
+# - Turkmen (detected for ASCII art)
+# - Irish Gaelic (detected for Cthulhu-related text)
+# - Kannada (looks of disapproval)
+# - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
+# - Breton (the word "memes" itself)
 
-def cld2_reddit_tokenizer(text):
+
+def cld2_surface_tokenizer(text, mode='twitter'):
     """
-    A language-detecting tokenizer with special cases for handling text from
-    Reddit.
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
+
+    The `mode` can be 'twitter' or 'reddit', which slightly changes the
+    pre-processing of the text.
     """
-    text = URL_RE.sub('', text)
-    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+    text = unescape_html(text)
+    if mode == 'twitter':
+        text = TWITTER_HANDLE_RE.sub('', text)
+        text = TCO_RE.sub('', text)
+    elif mode == 'reddit':
+        text = URL_RE.sub('', text)
+        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
 
     lang = cld2_detect_language(text)
-    if lang not in KEEP_THESE_LANGUAGES:
-        # Reddit is 99.9% English, so if we detected a rare language, it's
-        # much more likely that it's actually English.
-        lang = 'en'
 
-    tokens = tokenize(text, lang, include_punctuation=True)
+    # If the detected language isn't in our pretty generous list of languages,
+    # return no tokens.
+    if lang not in KEEP_THESE_LANGUAGES:
+        return 'xx', []
+
+    # cld2's accuracy seems to improve dramatically with at least 50
+    # bytes of input, so throw away non-English below this length.
+    if len(text.encode('utf-8')) < 50 and lang != 'en':
+        return 'xx', []
+
+    tokens = tokenize(text, lang)
     return lang, tokens
@@ -85,7 +101,12 @@ def cld2_detect_language(text):
     #       Confidence score: float))
 
     text = CLD2_BAD_CHARS_RE.sub('', text)
-    return pycld2.detect(text)[2][0][1]
+    lang = pycld2.detect(text)[2][0][1]
+
+    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
+    # becomes 'zh'
+    code = langcodes.get(lang).language
+    return code
 
 
 def tokenize_by_language(in_filename, out_prefix, tokenizer):
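
Note (not part of the patch): a minimal sketch of the langcodes normalization that
cld2_detect_language() now relies on, assuming the langcodes package added to
install_requires above. The example values are the ones named in the patch's own
comment ('iw' becomes 'he', 'zh-Hant' becomes 'zh').

    import langcodes

    # pycld2 can report deprecated or script-qualified codes; langcodes.get()
    # parses the tag, and .language keeps only the normalized base language.
    print(langcodes.get('iw').language)       # 'he' -- deprecated code for Hebrew
    print(langcodes.get('zh-Hant').language)  # 'zh' -- script subtag is dropped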