actually use the results of language-detection on Reddit

2024-12-23 09:21:37 +00:00 · 2016-03-24 16:27:24 -04:00 · 2016-03-24 16:27:24 -04:00 · 75a4a92110
commit 75a4a92110
parent 164a5b1a05
6 changed files with 113 additions and 58 deletions
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@ -46,6 +46,9 @@ rule simplify_chinese
 rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
 rule tokenize_reddit
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
 # To convert the Leeds corpus, look for space-separated lines that start with
 # an integer and a decimal. The integer is the rank, which we discard. The
 # decimal is the frequency, and the remaining text is the term. Use sed -n
@ -101,4 +104,4 @@ rule cat
  command = cat $in > $out
 rule extract_reddit
-  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@ -2,12 +2,12 @@ from setuptools import setup
 setup(
    name="wordfreq_builder",
-    version='0.1',
+    version='0.2',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
-    install_requires=['msgpack-python', 'pycld2']
+    install_requires=['msgpack-python', 'pycld2', 'langcodes']
 )
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@ -1,13 +1,17 @@
-from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 def reddit_tokenizer(text):
    return cld2_surface_tokenizer(text, mode='reddit')
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one comment per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
-    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)
 if __name__ == '__main__':
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@ -41,7 +41,10 @@ CONFIG = {
        'subtlex-en': ['en'],
        'subtlex-other': ['de', 'nl', 'zh'],
        'jieba': ['zh'],
-        'reddit': ['en'],
+        'reddit': [
            'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
            'ru', 'sv'
        ]
    },
    # Subtlex languages that need to be pre-processed
    'wordlist_paths': {
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@ -4,6 +4,8 @@ from wordfreq_builder.config import (
 import sys
 import pathlib
 import itertools
 from collections import defaultdict
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can change its behavior by editing wordfreq_builder/ninja.py,
@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
    for language in languages:
        combined_output = wordlist_filename('twitter', language, 'tokens.txt')
        language_inputs = [
            '{prefix}.{lang}.txt'.format(
                prefix=slice_files[slicenum], lang=language
            )
            for slicenum in range(slices)
        ]
        add_dep(lines, 'cat', language_inputs, combined_output)
        count_file = wordlist_filename('twitter', language, 'counts.txt')
@ -236,27 +236,51 @@ def jieba_deps(dirname_in, languages):
    return lines
 def reddit_base_filename(path):
    """
    Get the base name of a Reddit input file, without its path or extension.
    """
    return path.name[:-4]
 def reddit_deps(dirname_in, languages):
    lines = []
    if not languages:
        return lines
    assert languages == ['en']
    processed_files = []
    path_in = pathlib.Path(dirname_in)
-    for filepath in path_in.glob('*/*.bz2'):
+    slices = {}
-        base = filepath.name[:-4]
+    counts_by_language = defaultdict(list)
        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
        add_dep(lines, 'count', transformed_file, count_file)
        processed_files.append(count_file)
-    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    # Extract text from the Reddit comment dumps, and write them to
-    add_dep(
+    # uncompressed .txt files
-        lines, 'merge_counts', processed_files, output_file,
+    for filepath in path_in.glob('*/*.bz2'):
-        params={'cutoff': 3}
+        base = reddit_base_filename(filepath)
-    )
+        transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
        slices[base] = transformed_file
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
    for base in sorted(slices):
        transformed_file = slices[base]
        language_outputs = []
        for language in languages:
            filename = wordlist_filename('reddit', base + '.' + language, '.txt')
            language_outputs.append(filename)
            count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
            add_dep(lines, 'count', filename, count_filename)
            counts_by_language[language].append(count_filename)
        # find the prefix by constructing a filename, then stripping off
        # '.xx.txt' from the end
        prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
        add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
                params={'prefix': prefix},
                extra='wordfreq_builder/tokenizers.py')
    for language in languages:
        output_file = wordlist_filename('reddit', language, 'counts.txt')
        add_dep(
            lines, 'merge_counts', counts_by_language[language], output_file,
            params={'cutoff': 3}
        )
    return lines
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -2,6 +2,7 @@ from wordfreq import tokenize
 from ftfy.fixes import unescape_html
 import regex
 import pycld2
 import langcodes
 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
    [
@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
 MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
-def cld2_surface_tokenizer(text):
+# Low-frequency languages tend to be detected incorrectly by cld2. The
-    """
+# following languages are the ones that appear in our data with any
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
+# reasonable frequency, and seem to usually be detected *correctly*. These are
-    """
+# the languages we'll keep in the Reddit and Twitter results.
-    text = unescape_html(text)
+#
-    text = TWITTER_HANDLE_RE.sub('', text)
+# This list is larger than the list that wordfreq ultimately generates, so we
-    text = TCO_RE.sub('', text)
+# can look here as a source of future data.
    lang = cld2_detect_language(text)
    # Don't allow tokenization in Chinese when language-detecting, because
    # the Chinese tokenizer may not be built yet
    if lang == 'zh':
        lang = 'en'
    tokens = tokenize(text, lang)
    return lang, tokens
 # Low-frequency languages tend to be detected incorrectly. Keep a limited
 # list of languages we're allowed to use here.
 KEEP_THESE_LANGUAGES = {
-    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
+    'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
-    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+    'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
    'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
    'tr', 'uk', 'vi'
 }
 # Semi-frequent languages that are excluded by the above:
 #
 #   - Chinese, not because it's detected incorrectly, but because we can't
 #     handle it until we already have word frequencies
 #   - Thai (seems to be detected whenever someone uses Thai characters in
 #     an emoticon)
 #   - Welsh (which is detected for "ohmygodohmygodohmygod")
 #   - Turkmen (detected for ASCII art)
 #   - Irish Gaelic (detected for Cthulhu-related text)
 #   - Kannada (looks of disapproval)
 #   - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
 #   - Breton (the word "memes" itself)
-def cld2_reddit_tokenizer(text):
+
 def cld2_surface_tokenizer(text, mode='twitter'):
    """
-    A language-detecting tokenizer with special cases for handling text from
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
-    Reddit.
+
    The `mode` can be 'twitter' or 'reddit', which slightly changes the
    pre-processing of the text.
    """
-    text = URL_RE.sub('', text)
+    text = unescape_html(text)
-    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+    if mode == 'twitter':
        text = TWITTER_HANDLE_RE.sub('', text)
        text = TCO_RE.sub('', text)
    elif mode == 'reddit':
        text = URL_RE.sub('', text)
        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
    lang = cld2_detect_language(text)
    if lang not in KEEP_THESE_LANGUAGES:
        # Reddit is 99.9% English, so if we detected a rare language, it's
        # much more likely that it's actually English.
        lang = 'en'
-    tokens = tokenize(text, lang, include_punctuation=True)
+    # If the detected language isn't in our pretty generous list of languages,
    # return no tokens.
    if lang not in KEEP_THESE_LANGUAGES:
        return 'xx', []
    # cld2's accuracy seems to improve dramatically with at least 50
    # bytes of input, so throw away non-English text below this length.
    if len(text.encode('utf-8')) < 50 and lang != 'en':
        return 'xx', []
    tokens = tokenize(text, lang)
    return lang, tokens
@ -85,7 +101,12 @@ def cld2_detect_language(text):
    #       Confidence score: float))
    text = CLD2_BAD_CHARS_RE.sub('', text)
-    return pycld2.detect(text)[2][0][1]
+    lang = pycld2.detect(text)[2][0][1]
    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh'
    code = langcodes.get(lang).language
    return code
 def tokenize_by_language(in_filename, out_prefix, tokenizer):