diff --git a/scripts/ninja2dot.py b/scripts/ninja2dot.py
index 42b5362..f73131c 100644
--- a/scripts/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,30 +1,39 @@
 """ This file generates a graph of the dependencies for the ninja build."""
 import sys
+import re
 
 
 def ninja_to_dot():
-    def last_component(path):
-        return path.split('/')[-1]
+    def simplified_filename(path):
+        component = path.split('/')[-1]
+        return re.sub(
+            r'[0-9]+-of', 'NN-of',
+            re.sub(r'part[0-9]+', 'partNN', component)
+        )
 
     print("digraph G {")
     print('rankdir="LR";')
+    seen_edges = set()
     for line in sys.stdin:
         line = line.rstrip()
         if line.startswith('build'):
             # the output file is the first argument; strip off the colon that
             # comes from ninja syntax
             output_text, input_text = line.split(':')
-            outfiles = [last_component(part) for part in output_text.split(' ')[1:]]
+            outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
             inputs = input_text.strip().split(' ')
-            infiles = [last_component(part) for part in inputs[1:]]
+            infiles = [simplified_filename(part) for part in inputs[1:]]
             operation = inputs[0]
             for infile in infiles:
                 if infile == '|':
                     # external dependencies start here; let's not graph those
                     break
                 for outfile in outfiles:
-                    print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation))
+                    edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
+                    if edge not in seen_edges:
+                        seen_edges.add(edge)
+                        print(edge)
     print("}")
diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png
new file mode 100644
index 0000000..15635c6
Binary files /dev/null and b/wordfreq_builder/build.png differ
diff --git a/wordfreq_builder/build.png.REMOVED.git-id b/wordfreq_builder/build.png.REMOVED.git-id
deleted file mode 100644
index 9fe6754..0000000
--- a/wordfreq_builder/build.png.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-ef54b21e931c530f5b75c1cd87c5841cc4691e43
\ No newline at end of file
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index b708533..f06e5f2 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,6 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
     command = tr ' ' ',' < $in > $out
 
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
+# tabs to commas and commas to spaces, and remove lines with obvious mojibake.
+rule convert_subtlex
+    command = cut -f 1,$col $in | tail -n +2 | tr '\t,' ', ' | grep -v 'â,' > $out
+
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
 # lines with counts of 100 or more.
@@ -71,7 +76,10 @@ rule count
     command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-    command = python -m wordfreq_builder.cli.combine_lists -o $out $in
+    command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+
+rule merge_counts
+    command = python -m wordfreq_builder.cli.merge_counts -o $out $in
 
 rule freqs2cB
     command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
similarity index 70%
rename from wordfreq_builder/wordfreq_builder/cli/combine_lists.py
rename to wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 61d4b1d..772b951 100644
--- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,12 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
-    freq_dicts = []
+    count_dicts = []
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=2))
-    merged = merge_freqs(freq_dicts)
+        count_dicts.append(read_freqs(input_name, cutoff=0))
+    merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
 
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
new file mode 100644
index 0000000..0bbe1c1
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+import argparse
+
+
+def merge_lists(input_names, output_name, cutoff):
+    freq_dicts = []
+    for input_name in input_names:
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+    merged = merge_freqs(freq_dicts)
+    write_wordlist(merged, output_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
+    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    args = parser.parse_args()
+    merge_lists(args.inputs, args.output, args.cutoff)
+
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index a80c327..7c523fb 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -11,12 +11,12 @@ CONFIG = {
         'twitter': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru',
-            # can be added later: 'th', 'tr'
+            # can be added later: 'el', 'tr'
         ],
         'wikipedia': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru'
-            # many more can be added
+            # consider adding 'el' and 'tr'
         ],
         'opensubtitles': [
             # All languages where the most common word in OpenSubtitles
@@ -33,14 +33,19 @@ CONFIG = {
             'en',
             # Using the 2012 data, we could get French, German, Italian,
             # Russian, Spanish, and (Simplified) Chinese.
-        ]
+        ],
+        'subtlex-en': ['en'],
+        'subtlex-zh': ['zh'],
     },
+    # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
         'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
+        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 84c1818..65773d6 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -5,7 +5,8 @@ import sys
 import pathlib
 
 HEADER = """# This file is automatically generated. Do not edit it.
-# You can regenerate it using the 'wordfreq-build-deps' command.
+# You can change its behavior by editing wordfreq_builder/ninja.py,
+# and regenerate it by running 'make'.
 """
 TMPDIR = data_filename('tmp')
 
@@ -76,6 +77,18 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             CONFIG['sources']['opensubtitles']
         )
     )
+    lines.extend(
+        subtlex_en_deps(
+            data_filename('source-lists/subtlex'),
+            CONFIG['sources']['subtlex-en']
+        )
+    )
+    lines.extend(
+        subtlex_zh_deps(
+            data_filename('source-lists/subtlex'),
+            CONFIG['sources']['subtlex-zh']
+        )
+    )
     lines.extend(combine_lists(all_languages()))
 
     print('\n'.join(lines), file=out)
@@ -188,12 +201,53 @@ def opensubtitles_deps(dirname_in, languages):
             prefix=dirname_in, lang=language
        )
         reformatted_file = wordlist_filename(
-            'opensubtitles', language, 'counts.txt')
+            'opensubtitles', language, 'counts.txt'
+        )
         add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
     return lines
 
 
+def subtlex_en_deps(dirname_in, languages):
+    lines = []
+    assert languages == ['en']
+    regions = ['en-US', 'en-GB']
+    processed_files = []
+    for region in regions:
+        input_file = '{prefix}/subtlex.{region}.txt'.format(
+            prefix=dirname_in, region=region
+        )
+        processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
+        processed_files.append(processed_file)
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'col': 2}
+        )
+
+    output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
+    add_dep(lines, 'merge_counts', processed_files, output_file)
+
+    return lines
+
+
+def subtlex_zh_deps(dirname_in, languages):
+    lines = []
+    for language in languages:
+        input_file = '{prefix}/subtlex.{lang}.txt'.format(
+            prefix=dirname_in, lang=language
+        )
+        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'col': 5}
+        )
+        add_dep(
+            lines, 'merge_counts', processed_file, output_file
+        )
+    return lines
+
+
 def combine_lists(languages):
     lines = []
     for language in languages:
@@ -204,7 +258,8 @@
         ]
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'cutoff': 2})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 9da95a3..63d1980 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -49,13 +49,14 @@ def read_freqs(filename, cutoff=0, lang=None):
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
             val = float(strval)
+            key = fix_text(key)
             if val < cutoff:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[fix_text(token)] += val
+                raw_counts[token] += val
                 total += val
 
     for word in raw_counts:
@@ -96,6 +97,17 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
         msgpack.dump(cBpack_data, outfile)
 
 
+def merge_counts(count_dicts):
+    """
+    Merge multiple dictionaries of counts by adding their entries.
+    """
+    merged = defaultdict(int)
+    for count_dict in count_dicts:
+        for term, count in count_dict.items():
+            merged[term] += count
+    return merged
+
+
 def merge_freqs(freq_dicts):
     """
     Merge multiple dictionaries of frequencies, representing each word with
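Note on the semantics of the new merge_counts helper: unlike merge_freqs, which
operates on normalized frequencies, merge_counts simply sums raw counts across
sources, which is why the subtlex-en pipeline can feed it the two regional files.
A minimal runnable sketch (the function body is copied from the diff above; the
words and counts are invented for illustration):

    from collections import defaultdict


    def merge_counts(count_dicts):
        """
        Merge multiple dictionaries of counts by adding their entries.
        """
        merged = defaultdict(int)
        for count_dict in count_dicts:
            for term, count in count_dict.items():
                merged[term] += count
        return merged


    # Hypothetical counts standing in for the two regional SUBTLEX files
    # (subtlex.en-US.txt and subtlex.en-GB.txt after convert_subtlex):
    us_counts = {'the': 1501908, 'color': 22702, 'colour': 886}
    gb_counts = {'the': 1339811, 'color': 317, 'colour': 4894}

    print(dict(merge_counts([us_counts, gb_counts])))
    # {'the': 2841719, 'color': 23019, 'colour': 5780}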