diff --git a/setup.py b/setup.py
index 2926f3f..87a0936 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.2',
+    version='1.3',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz
index 15d464b..1096472 100644
Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ
diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz
index 6b1f8e5..94af721 100644
Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ
diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz
index 1da9edc..856abc1 100644
Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ
diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz
index 642dc49..bbd0cc4 100644
Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ
diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz
index 58269d1..39f0eea 100644
Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ
diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz
index 7bd9905..6faea92 100644
Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ
diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz
index 9844891..9b33049 100644
Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ
diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz
index 3734388..741f518 100644
Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ
diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz
index 5477083..f1c660d 100644
Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ
diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz
index 05f30a3..5dda29a 100644
Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ
diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz
index 1aede1d..d7f4ad7 100644
Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ
diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz
index f6208a3..48d681a 100644
Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pl.msgpack.gz b/wordfreq/data/combined_pl.msgpack.gz
index 6a70139..2d45b1a 100644
Binary files a/wordfreq/data/combined_pl.msgpack.gz and b/wordfreq/data/combined_pl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz
index 4009376..7371866 100644
Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ
diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz
index 4eeb762..123eb54 100644
Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ
diff --git a/wordfreq/data/combined_sv.msgpack.gz b/wordfreq/data/combined_sv.msgpack.gz
index 0c883ae..0cc1398 100644
Binary files a/wordfreq/data/combined_sv.msgpack.gz and b/wordfreq/data/combined_sv.msgpack.gz differ
diff --git a/wordfreq/data/combined_tr.msgpack.gz b/wordfreq/data/combined_tr.msgpack.gz
index 53ae060..3f6063c 100644
Binary files a/wordfreq/data/combined_tr.msgpack.gz and b/wordfreq/data/combined_tr.msgpack.gz differ
diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz
index 02b11c1..1205f84 100644
Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz
index 77f00de..d87307f 100644
Binary files a/wordfreq/data/twitter_ar.msgpack.gz and b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index 2c4b131..9422fb5 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_el.msgpack.gz b/wordfreq/data/twitter_el.msgpack.gz
index 4f063da..af1a0b1 100644
Binary files a/wordfreq/data/twitter_el.msgpack.gz and b/wordfreq/data/twitter_el.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index 126d991..cb81d3d 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index e0a7518..9f80432 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index 115fe97..05de393 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index 081ddcb..579964a 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index f97e2a2..174235b 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index d428de3..8f739f9 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index 2e6678e..334a127 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 9907914..346bdaa 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index 0a1183f..7681324 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pl.msgpack.gz b/wordfreq/data/twitter_pl.msgpack.gz
index 5a8d62a..11b61eb 100644
Binary files a/wordfreq/data/twitter_pl.msgpack.gz and b/wordfreq/data/twitter_pl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index e243cc7..0e845ab 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 01cd2de..e426344 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq/data/twitter_sv.msgpack.gz b/wordfreq/data/twitter_sv.msgpack.gz
index 333f2a7..ab1e956 100644
Binary files a/wordfreq/data/twitter_sv.msgpack.gz and b/wordfreq/data/twitter_sv.msgpack.gz differ
diff --git a/wordfreq/data/twitter_tr.msgpack.gz b/wordfreq/data/twitter_tr.msgpack.gz
index 9a955b3..28eefa6 100644
Binary files a/wordfreq/data/twitter_tr.msgpack.gz and b/wordfreq/data/twitter_tr.msgpack.gz differ
diff --git a/wordfreq_builder/lib/jq-linux64 b/wordfreq_builder/lib/jq-linux64
new file mode 100755
index 0000000..939227e
Binary files /dev/null and b/wordfreq_builder/lib/jq-linux64 differ
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index df00062..ac9d4a0 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -13,7 +13,7 @@
 # contains the programatically-defined dependency graph.
 
 # Variables
-DATA = ./data
+JQ = lib/jq-linux64
 
 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@@ -92,10 +92,13 @@ rule merge
   command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
 
 rule cat
   command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 5e3de69..4efe1d9 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse
 
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -13,8 +13,12 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index ddc308c..e16660b 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, args.cutoff, args.language)
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
new file mode 100644
index 0000000..6a275b3
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one comment per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
index 879caa4..d144866 100644
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
@@ -7,7 +7,7 @@ def main():
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 
 
 if __name__ == '__main__':
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 7ae1798..e0006e1 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -40,7 +40,8 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 80437ff..dc2a058 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-other']
         ),
+        reddit_deps(
+            data_filename('raw-input/reddit'),
+            CONFIG['sources']['reddit']
+        ),
         jieba_deps(
             data_filename('source-lists/jieba'),
             CONFIG['sources']['jieba']
@@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
@@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
     )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )
     return lines
 
 
@@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
         params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
     )
     add_dep(
-        lines, 'merge_counts', processed_file, output_file
+        lines, 'merge_counts', processed_file, output_file,
+        params={'cutoff': 0}
     )
     return lines
 
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 7d18026..b47e94a 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 
 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
 def cld2_surface_tokenizer(text):
@@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
+
     lang = cld2_detect_language(text)
 
     # Don't allow tokenization in Chinese when language-detecting, because
@@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
     return lang, tokens
 
 
+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
+    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
+        lang = 'en'
+
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language.
@@ -59,13 +88,11 @@ def cld2_detect_language(text):
     return pycld2.detect(text)[2][0][1]
 
 
-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
@@ -74,7 +101,7 @@
         text = line.split('\t')[-1].strip()
         language, tokens = tokenizer(text)
         if language != 'un':
-            tokenized = '\n'.join(tokens)
+            tokenized = ' '.join(tokens)
             out_filename = '%s.%s.txt' % (out_prefix, language)
             if out_filename in out_files:
                 out_file = out_files[out_filename]
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index a3bf0ae..47de7e5 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -24,16 +24,19 @@ def count_tokens(filename):
     containing '�'.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            line = URL_RE.sub('', line.strip())
-            for token in simple_tokenize(line):
-                counts[token] += 1
-
+    if filename.endswith('gz'):
+        infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
+    else:
+        infile = open(filename, encoding='utf-8', errors='replace')
+    for line in infile:
+        line = URL_RE.sub('', line.strip())
+        for token in simple_tokenize(line):
+            counts[token] += 1
+    infile.close()
     return counts
 
 
-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file.
     Returns a dictionary of values and the total of all values.
@@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
@@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific
     preprocessing operations.
     """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
     for word in values:
         values[word] /= total
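
In ninja.py, each Reddit shard flows through extract_reddit, then the count rule, then merge_counts with a cutoff of 3; tokenize_reddit.py separately exposes the Reddit-specific tokenizer for splitting comments by language. A minimal sketch of how the Python functions added or renamed above compose, assuming only what this diff defines; the input and output filenames are hypothetical:

    # Hypothetical walk-through of the new Reddit path (not part of the patch).
    from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
    from wordfreq_builder.word_counts import (
        count_tokens, read_values, merge_counts, write_wordlist
    )

    # Split a file of extracted comments (one per line) into per-language
    # token files named comments.<lang>.txt; rarely detected languages fall
    # back to English inside cld2_reddit_tokenizer.
    tokenize_by_language('comments.txt', 'comments', tokenizer=cld2_reddit_tokenizer)

    # Count tokens in the English output; count_tokens now also accepts
    # gzipped input, matching the .txt.gz files the build produces.
    counts = count_tokens('comments.en.txt')
    write_wordlist(counts, 'comments.en.counts.txt')

    # Merge count files, dropping counts below a cutoff and capping how many
    # entries are read per file, as merge_counts -c 3 now does for Reddit.
    values, total = read_values('comments.en.counts.txt', cutoff=3, max_size=1000000)
    merged = merge_counts([values])
    write_wordlist(merged, 'reddit_en.counts.txt')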