Merge pull request #30 from LuminosoInsight/add-reddit

Add English data from Reddit corpus Former-commit-id: d18fee3d78
2024-12-23 09:21:37 +00:00 · 2016-01-14 15:52:39 -05:00 · 2016-01-14 15:52:39 -05:00 · 927d4f45a4
commit 927d4f45a4
parent eae7b2752e 6eca3cff5a
46 changed files with 121 additions and 32 deletions
--- a/setup.py
+++ b/setup.py
@ -34,7 +34,7 @@ if sys.version_info < (3, 4):

 setup(
    name="wordfreq",
-    version='1.2',
+    version='1.3',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
--- a/wordfreq/data/combined_ar.msgpack.gz
+++ b/wordfreq/data/combined_ar.msgpack.gz
--- a/wordfreq/data/combined_de.msgpack.gz
+++ b/wordfreq/data/combined_de.msgpack.gz
--- a/wordfreq/data/combined_el.msgpack.gz
+++ b/wordfreq/data/combined_el.msgpack.gz
--- a/wordfreq/data/combined_en.msgpack.gz
+++ b/wordfreq/data/combined_en.msgpack.gz
--- a/wordfreq/data/combined_es.msgpack.gz
+++ b/wordfreq/data/combined_es.msgpack.gz
--- a/wordfreq/data/combined_fr.msgpack.gz
+++ b/wordfreq/data/combined_fr.msgpack.gz
--- a/wordfreq/data/combined_id.msgpack.gz
+++ b/wordfreq/data/combined_id.msgpack.gz
--- a/wordfreq/data/combined_it.msgpack.gz
+++ b/wordfreq/data/combined_it.msgpack.gz
--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz
--- a/wordfreq/data/combined_ko.msgpack.gz
+++ b/wordfreq/data/combined_ko.msgpack.gz
--- a/wordfreq/data/combined_ms.msgpack.gz
+++ b/wordfreq/data/combined_ms.msgpack.gz
--- a/wordfreq/data/combined_nl.msgpack.gz
+++ b/wordfreq/data/combined_nl.msgpack.gz
--- a/wordfreq/data/combined_pl.msgpack.gz
+++ b/wordfreq/data/combined_pl.msgpack.gz
--- a/wordfreq/data/combined_pt.msgpack.gz
+++ b/wordfreq/data/combined_pt.msgpack.gz
--- a/wordfreq/data/combined_ru.msgpack.gz
+++ b/wordfreq/data/combined_ru.msgpack.gz
--- a/wordfreq/data/combined_sv.msgpack.gz
+++ b/wordfreq/data/combined_sv.msgpack.gz
--- a/wordfreq/data/combined_tr.msgpack.gz
+++ b/wordfreq/data/combined_tr.msgpack.gz
--- a/wordfreq/data/combined_zh.msgpack.gz
+++ b/wordfreq/data/combined_zh.msgpack.gz
--- a/wordfreq/data/twitter_ar.msgpack.gz
+++ b/wordfreq/data/twitter_ar.msgpack.gz
--- a/wordfreq/data/twitter_de.msgpack.gz
+++ b/wordfreq/data/twitter_de.msgpack.gz
--- a/wordfreq/data/twitter_el.msgpack.gz
+++ b/wordfreq/data/twitter_el.msgpack.gz
--- a/wordfreq/data/twitter_en.msgpack.gz
+++ b/wordfreq/data/twitter_en.msgpack.gz
--- a/wordfreq/data/twitter_es.msgpack.gz
+++ b/wordfreq/data/twitter_es.msgpack.gz
--- a/wordfreq/data/twitter_fr.msgpack.gz
+++ b/wordfreq/data/twitter_fr.msgpack.gz
--- a/wordfreq/data/twitter_id.msgpack.gz
+++ b/wordfreq/data/twitter_id.msgpack.gz
--- a/wordfreq/data/twitter_it.msgpack.gz
+++ b/wordfreq/data/twitter_it.msgpack.gz
--- a/wordfreq/data/twitter_ja.msgpack.gz
+++ b/wordfreq/data/twitter_ja.msgpack.gz
--- a/wordfreq/data/twitter_ko.msgpack.gz
+++ b/wordfreq/data/twitter_ko.msgpack.gz
--- a/wordfreq/data/twitter_ms.msgpack.gz
+++ b/wordfreq/data/twitter_ms.msgpack.gz
--- a/wordfreq/data/twitter_nl.msgpack.gz
+++ b/wordfreq/data/twitter_nl.msgpack.gz
--- a/wordfreq/data/twitter_pl.msgpack.gz
+++ b/wordfreq/data/twitter_pl.msgpack.gz
--- a/wordfreq/data/twitter_pt.msgpack.gz
+++ b/wordfreq/data/twitter_pt.msgpack.gz
--- a/wordfreq/data/twitter_ru.msgpack.gz
+++ b/wordfreq/data/twitter_ru.msgpack.gz
--- a/wordfreq/data/twitter_sv.msgpack.gz
+++ b/wordfreq/data/twitter_sv.msgpack.gz
--- a/wordfreq/data/twitter_tr.msgpack.gz
+++ b/wordfreq/data/twitter_tr.msgpack.gz
--- a/wordfreq_builder/lib/jq-linux64
+++ b/wordfreq_builder/lib/jq-linux64
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@ -13,7 +13,7 @@
 # contains the programmatically-defined dependency graph.

 # Variables
-DATA = ./data
+JQ = lib/jq-linux64

 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@ -92,10 +92,13 @@ rule merge
  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in

 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in

 rule freqs2cB
  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out

 rule cat
  command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse


-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
    count_dicts = []
    for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
        count_dicts.append(values)
    merged = merge_counts(count_dicts)
    write_wordlist(merged, output_name)
@ -13,8 +13,12 @@ def merge_lists(input_names, output_name):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
    args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)

--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output, args.cutoff, args.language)

--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one comment per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse


@ -7,7 +7,7 @@ def main():
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)


 if __name__ == '__main__':
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@ -40,7 +40,8 @@ CONFIG = {
        ],
        'subtlex-en': ['en'],
        'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
    },
    # Subtlex languages that need to be pre-processed
    'wordlist_paths': {
@ -52,6 +53,7 @@ CONFIG = {
        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
        'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}',
        'combined-dist': 'dist/combined_{lang}.{ext}',
        'twitter-dist': 'dist/twitter_{lang}.{ext}',
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
            data_filename('source-lists/subtlex'),
            CONFIG['sources']['subtlex-other']
        ),
+        reddit_deps(
+            data_filename('raw-input/reddit'),
+            CONFIG['sources']['reddit']
+        ),
        jieba_deps(
            data_filename('source-lists/jieba'),
            CONFIG['sources']['jieba']
@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
    return lines


+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
        )

    output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )

    return lines

@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
        )
        add_dep(
-            lines, 'merge_counts', processed_file, output_file
+            lines, 'merge_counts', processed_file, output_file,
+            params={'cutoff': 0}
        )
    return lines

--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)

 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')


 def cld2_surface_tokenizer(text):
@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
+
    lang = cld2_detect_language(text)

    # Don't allow tokenization in Chinese when language-detecting, because
@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
    return lang, tokens


+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
+    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
+        lang = 'en'
+
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
@ -59,13 +88,11 @@ def cld2_detect_language(text):
    return pycld2.detect(text)[2][0][1]


-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
    """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.

-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
    between the tokens.
    """
    out_files = {}
@ -74,7 +101,7 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
            text = line.split('\t')[-1].strip()
            language, tokens = tokenizer(text)
            if language != 'un':
-                tokenized = '\n'.join(tokens)
+                tokenized = ' '.join(tokens)
                out_filename = '%s.%s.txt' % (out_prefix, language)
                if out_filename in out_files:
                    out_file = out_files[out_filename]
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@ -24,16 +24,19 @@ def count_tokens(filename):
    containing '<EFBFBD>'.
    """
    counts = defaultdict(int)
-    with open(filename, encoding='utf-8', errors='replace') as infile:
+    if filename.endswith('gz'):
+        infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
+    else:
+        infile = open(filename, encoding='utf-8', errors='replace')
    for line in infile:
        line = URL_RE.sub('', line.strip())
        for token in simple_tokenize(line):
            counts[token] += 1
-
+    infile.close()
    return counts


-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.
@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
    If lang is given, read_freqs will apply language specific preprocessing
    operations.
    """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
    for word in values:
        values[word] /= total