actually use the results of language-detection on Reddit

Former-commit-id: 75a4a92110
2024-12-23 17:31:41 +00:00 · 2016-03-24 16:27:24 -04:00 · 2016-03-24 16:27:24 -04:00 · c3364ef821
commit c3364ef821
parent a5fcfd100d
6 changed files with 113 additions and 58 deletions
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@ -46,6 +46,9 @@ rule simplify_chinese
 rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix

+rule tokenize_reddit
+  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
+
 # To convert the Leeds corpus, look for space-separated lines that start with
 # an integer and a decimal. The integer is the rank, which we discard. The
 # decimal is the frequency, and the remaining text is the term. Use sed -n
@ -101,4 +104,4 @@ rule cat
  command = cat $in > $out

 rule extract_reddit
-  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@ -2,12 +2,12 @@ from setuptools import setup

 setup(
    name="wordfreq_builder",
-    version='0.1',
+    version='0.2',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
-    install_requires=['msgpack-python', 'pycld2']
+    install_requires=['msgpack-python', 'pycld2', 'langcodes']
 )
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@ -1,13 +1,17 @@
-from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse


+def reddit_tokenizer(text):
+    return cld2_surface_tokenizer(text, mode='reddit')
+
+
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one comment per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
-    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)


 if __name__ == '__main__':
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@ -41,7 +41,10 @@ CONFIG = {
        'subtlex-en': ['en'],
        'subtlex-other': ['de', 'nl', 'zh'],
        'jieba': ['zh'],
-        'reddit': ['en'],
+        'reddit': [
+            'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
+            'ru', 'sv'
+        ]
    },
    # Subtlex languages that need to be pre-processed
    'wordlist_paths': {
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@ -4,6 +4,8 @@ from wordfreq_builder.config import (
 import sys
 import pathlib
 import itertools
+from collections import defaultdict
+

 HEADER = """# This file is automatically generated. Do not edit it.
 # You can change its behavior by editing wordfreq_builder/ninja.py,
@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,

    for language in languages:
        combined_output = wordlist_filename('twitter', language, 'tokens.txt')
-
        language_inputs = [
            '{prefix}.{lang}.txt'.format(
                prefix=slice_files[slicenum], lang=language
            )
            for slicenum in range(slices)
        ]
-
        add_dep(lines, 'cat', language_inputs, combined_output)

        count_file = wordlist_filename('twitter', language, 'counts.txt')
@ -236,27 +236,51 @@ def jieba_deps(dirname_in, languages):
    return lines


+def reddit_base_filename(path):
+    """
+    Get the base name of a Reddit input file, without its path or '.bz2' extension.
+    """
+    return path.name[:-4]
+
+
 def reddit_deps(dirname_in, languages):
    lines = []
-    if not languages:
-        return lines
-    assert languages == ['en']
-
-    processed_files = []
    path_in = pathlib.Path(dirname_in)
-    for filepath in path_in.glob('*/*.bz2'):
-        base = filepath.name[:-4]
-        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
-        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
-        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
-        add_dep(lines, 'count', transformed_file, count_file)
-        processed_files.append(count_file)
+    slices = {}
+    counts_by_language = defaultdict(list)

-    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
-    add_dep(
-        lines, 'merge_counts', processed_files, output_file,
-        params={'cutoff': 3}
-    )
+    # Extract text from the Reddit comment dumps, and write it to
+    # uncompressed .txt files
+    for filepath in path_in.glob('*/*.bz2'):
+        base = reddit_base_filename(filepath)
+        transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
+        slices[base] = transformed_file
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+
+    for base in sorted(slices):
+        transformed_file = slices[base]
+        language_outputs = []
+        for language in languages:
+            filename = wordlist_filename('reddit', base + '.' + language, '.txt')
+            language_outputs.append(filename)
+
+            count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
+            add_dep(lines, 'count', filename, count_filename)
+            counts_by_language[language].append(count_filename)
+
+        # find the prefix by constructing a filename, then stripping off
+        # '.xx.txt' from the end
+        prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
+        add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
+                params={'prefix': prefix},
+                extra='wordfreq_builder/tokenizers.py')
+
+    for language in languages:
+        output_file = wordlist_filename('reddit', language, 'counts.txt')
+        add_dep(
+            lines, 'merge_counts', counts_by_language[language], output_file,
+            params={'cutoff': 3}
+        )
    return lines


--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -2,6 +2,7 @@ from wordfreq import tokenize
 from ftfy.fixes import unescape_html
 import regex
 import pycld2
+import langcodes

 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
    [
@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
 MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')


-def cld2_surface_tokenizer(text):
-    """
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
-    """
-    text = unescape_html(text)
-    text = TWITTER_HANDLE_RE.sub('', text)
-    text = TCO_RE.sub('', text)
+# Low-frequency languages tend to be detected incorrectly by cld2. The
+# languages listed below are the ones that appear in our data with any
+# reasonable frequency and usually seem to be detected *correctly*. These
+# are the languages we'll keep in the Reddit and Twitter results.
+#
+# This list is larger than the list that wordfreq ultimately generates, so we
+# can look here as a source of future data.

-    lang = cld2_detect_language(text)
-
-    # Don't allow tokenization in Chinese when language-detecting, because
-    # the Chinese tokenizer may not be built yet
-    if lang == 'zh':
-        lang = 'en'
-
-    tokens = tokenize(text, lang)
-    return lang, tokens
-
-
-# Low-frequency languages tend to be detected incorrectly. Keep a limited
-# list of languages we're allowed to use here.
 KEEP_THESE_LANGUAGES = {
-    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
-    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+    'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
+    'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
+    'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
+    'tr', 'uk', 'vi'
 }

+# Semi-frequent languages that are excluded by the above:
+#
+#   - Chinese, not because it's detected incorrectly, but because we can't
+#     handle it until we already have word frequencies
+#   - Thai (seems to be detected whenever someone uses Thai characters in
+#     an emoticon)
+#   - Welsh (which is detected for "ohmygodohmygodohmygod")
+#   - Turkmen (detected for ASCII art)
+#   - Irish Gaelic (detected for Cthulhu-related text)
+#   - Kannada (looks of disapproval)
+#   - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
+#   - Breton (the word "memes" itself)

-def cld2_reddit_tokenizer(text):
+
+def cld2_surface_tokenizer(text, mode='twitter'):
    """
-    A language-detecting tokenizer with special cases for handling text from
-    Reddit.
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
+
+    The `mode` can be 'twitter' or 'reddit', which slightly changes the
+    pre-processing of the text.
    """
-    text = URL_RE.sub('', text)
-    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+    text = unescape_html(text)
+    if mode == 'twitter':
+        text = TWITTER_HANDLE_RE.sub('', text)
+        text = TCO_RE.sub('', text)
+    elif mode == 'reddit':
+        text = URL_RE.sub('', text)
+        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)
-    if lang not in KEEP_THESE_LANGUAGES:
-        # Reddit is 99.9% English, so if we detected a rare language, it's
-        # much more likely that it's actually English.
-        lang = 'en'

-    tokens = tokenize(text, lang, include_punctuation=True)
+    # If the detected language isn't in our pretty generous list of languages,
+    # return no tokens.
+    if lang not in KEEP_THESE_LANGUAGES:
+        return 'xx', []
+
+    # cld2's accuracy seems to improve dramatically with at least 50
+    # bytes of input, so throw away non-English below this length.
+    if len(text.encode('utf-8')) < 50 and lang != 'en':
+        return 'xx', []
+
+    tokens = tokenize(text, lang)
    return lang, tokens


@ -85,7 +101,12 @@ def cld2_detect_language(text):
    #       Confidence score: float))

    text = CLD2_BAD_CHARS_RE.sub('', text)
-    return pycld2.detect(text)[2][0][1]
+    lang = pycld2.detect(text)[2][0][1]
+
+    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
+    # becomes 'zh'
+    code = langcodes.get(lang).language
+    return code


 def tokenize_by_language(in_filename, out_prefix, tokenizer):