diff --git a/.gitignore b/.gitignore
index 975f163..a68e8ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pip-log.txt
 .coverage
 *~
 wordfreq-data.tar.gz
+.idea
+build.dot
diff --git a/README.md b/README.md
index c0eb421..0bba163 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ install them on Ubuntu:
 
 ## Usage
 
 wordfreq provides access to estimates of the frequency with which a word is
-used, in 15 languages (see *Supported languages* below). It loads
+used, in 16 languages (see *Supported languages* below). It loads
 efficiently-packed data structures that contain all words that appear at least
 once per million words.
@@ -118,34 +118,38 @@ of word usage on different topics at different levels of formality. The sources
 
 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
 - **OpenSub**: OpenSubtitles
+- **SUBTLEX**: The SUBTLEX word frequency lists
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015
 
-The following 12 languages are well-supported, using at least 3 different sources
-of word frequencies:
+The following 14 languages are well-supported, with reasonable tokenization and
+at least 3 different sources of word frequencies:
 
-    Language   Code  GBooks LeedsIC OpenSub Twitter Wikipedia
-    ──────────────────┼──────────────────────────────────────────
-    Arabic     ar    │ -      Yes     Yes     Yes     Yes
-    German     de    │ -      Yes     Yes     Yes[1]  Yes
-    English    en    │ Yes    Yes     Yes     Yes     Yes
-    Spanish    es    │ -      Yes     Yes     Yes     Yes
-    French     fr    │ -      Yes     Yes     Yes     Yes
-    Indonesian id    │ -      -       Yes     Yes     Yes
-    Italian    it    │ -      Yes     Yes     Yes     Yes
-    Japanese   ja    │ -      Yes     -       Yes     Yes
-    Malay      ms    │ -      -       Yes     Yes     Yes
-    Dutch      nl    │ -      -       Yes     Yes     Yes
-    Portuguese pt    │ -      Yes     Yes     Yes     Yes
-    Russian    ru    │ -      Yes     Yes     Yes     Yes
+    Language   Code  GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    ──────────────────┼──────────────────────────────────────────────────
+    Arabic     ar    │ -      -       Yes     Yes     Yes     Yes
+    German     de    │ -      Yes     Yes     -       Yes[1]  Yes
+    Greek      el    │ -      -       Yes     Yes     Yes     Yes
+    English    en    │ Yes    Yes     Yes     Yes     Yes     Yes
+    Spanish    es    │ -      -       Yes     Yes     Yes     Yes
+    French     fr    │ -      -       Yes     Yes     Yes     Yes
+    Indonesian id    │ -      -       -       Yes     Yes     Yes
+    Italian    it    │ -      -       Yes     Yes     Yes     Yes
+    Japanese   ja    │ -      -       Yes     -       Yes     Yes
+    Malay      ms    │ -      -       -       Yes     Yes     Yes
+    Dutch      nl    │ -      Yes     -       Yes     Yes     Yes
+    Portuguese pt    │ -      -       Yes     Yes     Yes     Yes
+    Russian    ru    │ -      -       Yes     Yes     Yes     Yes
+    Turkish    tr    │ -      -       -       Yes     Yes     Yes
 
-These 3 languages are only marginally supported so far:
+These languages are only marginally supported so far. We have too few data
+sources for Korean (feel free to suggest some), and we are lacking
+tokenization support for Chinese.
 
-    Language   Code  GBooks LeedsIC OpenSub Twitter Wikipedia
-    ──────────────────┼──────────────────────────────────────────
-    Greek      el    │ -      Yes     Yes     -       -
-    Korean     ko    │ -      -       -       Yes     Yes
-    Chinese    zh    │ -      Yes     Yes     -       -
+    Language   Code  GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    ──────────────────┼──────────────────────────────────────────────────
+    Korean     ko    │ -      -       -       -       Yes     Yes
+    Chinese    zh    │ -      Yes     Yes     Yes     -       -
 
 [1] We've counted the frequencies from tweets in German, such as they are, but
 you should be aware that German is not a frequently-used language on Twitter.
@@ -219,7 +223,58 @@ sources:
 
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
+It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
+SUBTLEX-CH, created by Marc Brysbaert et al. and available at
+http://crr.ugent.be/programs-data/subtitle-frequencies.
+
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:
+
+- Wordfreq and code derived from it must credit the SUBTLEX authors.
+- It must remain clear that SUBTLEX is freely available data.
+
+These terms are similar to the Creative Commons Attribution-ShareAlike license.
+
 Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software gives statistics about words that are commonly used on
 Twitter; it does not display or republish any Twitter content.
+
+## Citations to work that wordfreq is built on
+
+- Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical
+  Evaluation of Current Word Frequency Norms and the Introduction of a New and
+  Improved Word Frequency Measure for American English. Behavior Research
+  Methods, 41 (4), 977-990.
+  http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf
+
+- Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A. M., Bölte, J., & Böhl, A.
+  (2015). The word frequency effect. Experimental Psychology.
+  http://econtent.hogrefe.com/doi/abs/10.1027/1618-3169/a000123?journalCode=zea
+
+- Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character
+  frequencies based on film subtitles. PLoS One, 5(6), e10729.
+  http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729
+
+- Dave, H. (2011). Frequency word lists.
+  https://invokeit.wordpress.com/frequency-word-lists/
+
+- Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
+  http://unicode.org/reports/tr29/
+
+- Keuleers, E., Brysbaert, M. & New, B. (2010). SUBTLEX-NL: A new frequency
+  measure for Dutch words based on film subtitles. Behavior Research Methods,
+  42(3), 643-650.
+  http://crr.ugent.be/papers/SUBTLEX-NL_BRM.pdf
+
+- Kudo, T. (2005). Mecab: Yet another part-of-speech and morphological
+  analyzer.
+  http://mecab.sourceforge.net/
+
+- van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
+  SUBTLEX-UK: A new and improved word frequency database for British English.
+  The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
+  http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
+
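For a quick sanity check of the new language support described above, the public API can be exercised directly. This is a hedged sketch: exact frequency values depend on the packaged data, and it assumes the top-level package re-exports `tokenize`, as the tests below suggest.

```python
from wordfreq import tokenize, word_frequency

# Turkish-specific casefolding: dotted İ and dotless I survive tokenization.
print(tokenize('İstanbul', 'tr'))    # ['istanbul']

# Frequencies are estimated proportions of running text, read from the
# 'combined' wordlist by default.
print(word_frequency('bir', 'tr'))   # a float; 'bir' is among the most common Turkish words
```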
diff --git a/scripts/ninja2dot.py b/scripts/ninja2dot.py
index 42b5362..f73131c 100644
--- a/scripts/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,30 +1,39 @@
 """ This file generates a graph of the dependencies for the ninja build."""
 import sys
+import re
 
 
 def ninja_to_dot():
-    def last_component(path):
-        return path.split('/')[-1]
+    def simplified_filename(path):
+        component = path.split('/')[-1]
+        return re.sub(
+            r'[0-9]+-of', 'NN-of',
+            re.sub(r'part[0-9]+', 'partNN', component)
+        )
 
     print("digraph G {")
     print('rankdir="LR";')
+    seen_edges = set()
     for line in sys.stdin:
         line = line.rstrip()
         if line.startswith('build'):
             # the output file is the first argument; strip off the colon that
             # comes from ninja syntax
             output_text, input_text = line.split(':')
-            outfiles = [last_component(part) for part in output_text.split(' ')[1:]]
+            outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
             inputs = input_text.strip().split(' ')
-            infiles = [last_component(part) for part in inputs[1:]]
+            infiles = [simplified_filename(part) for part in inputs[1:]]
             operation = inputs[0]
             for infile in infiles:
                 if infile == '|':
                     # external dependencies start here; let's not graph those
                     break
                 for outfile in outfiles:
-                    print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation))
+                    edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
+                    if edge not in seen_edges:
+                        seen_edges.add(edge)
+                        print(edge)
 
     print("}")
diff --git a/tests/test.py b/tests/test.py
index 0a8e212..21dd9ad 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -19,7 +19,7 @@ def test_freq_examples():
 def test_languages():
     # Make sure the number of available languages doesn't decrease
     avail = available_languages()
-    assert_greater(len(avail), 14)
+    assert_greater(len(avail), 15)
 
     # Laughter is the universal language
     for lang in avail:
@@ -36,7 +36,7 @@ def test_languages():
 
 def test_twitter():
     avail = available_languages('twitter')
-    assert_greater(len(avail), 12)
+    assert_greater(len(avail), 14)
 
     for lang in avail:
         assert_greater(word_frequency('rt', lang, 'twitter'),
@@ -68,6 +68,7 @@ def test_most_common_words():
     eq_(get_most_common('nl'), 'de')
     eq_(get_most_common('pt'), 'de')
     eq_(get_most_common('ru'), 'в')
+    eq_(get_most_common('tr'), 'bir')
     eq_(get_most_common('zh'), '的')
 
 
@@ -111,6 +112,8 @@ def test_tokenization():
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
     eq_(tokenize('weiß', 'de'), ['weiss'])
+    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
+    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
 
 
 def test_phrase_freq():
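The renamed `simplified_filename` helper exists to keep the graph legible: numbered shards of the same job collapse into one node, and the new `seen_edges` set then deduplicates the resulting parallel edges. A small self-contained check (the file names are hypothetical):

```python
import re

def simplified_filename(path):
    # Collapse shard numbers like 'part03' or '00-of-99' into placeholders,
    # so parallel slices of one job share a single graph node.
    component = path.split('/')[-1]
    return re.sub(
        r'[0-9]+-of', 'NN-of',
        re.sub(r'part[0-9]+', 'partNN', component)
    )

print(simplified_filename('generated/twitter/tweets-2014.tr.part03.txt'))
print(simplified_filename('generated/twitter/tweets-2014.tr.part17.txt'))
# both print 'tweets-2014.tr.partNN.txt', so they deduplicate to one edge
```

Since the script reads the ninja file on stdin and prints DOT, the graph (and the newly git-ignored `build.dot`) can be regenerated with something like `python scripts/ninja2dot.py < build.ninja > build.dot`, then rendered with Graphviz via `dot -Tpng build.dot -o build.png`.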
diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz
index 489af0b..024d15a 100644
Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ
diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz
index 417253a..01b582a 100644
Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ
diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz
index da19b35..de5fc2a 100644
Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ
diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz
index 32e455e..fa8a955 100644
Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ
diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz
index 43a1ea4..5edb142 100644
Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ
diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz
index e337d96..3d12c37 100644
Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ
diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz
index 83ac294..611d7e9 100644
Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ
diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz
index f357cfa..9480331 100644
Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ
diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz
index e06c840..7668e78 100644
Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ
diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz
index fed4292..1424631 100644
Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ
diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz
index 264612f..f4355ea 100644
Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ
diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz
index 33dda68..3a20c21 100644
Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz
index d63551f..49548be 100644
Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ
diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz
index c4585cd..9bf91ab 100644
Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ
diff --git a/wordfreq/data/combined_tr.msgpack.gz b/wordfreq/data/combined_tr.msgpack.gz
new file mode 100644
index 0000000..e0feca0
Binary files /dev/null and b/wordfreq/data/combined_tr.msgpack.gz differ
diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz
index 0e9581b..c16cfbc 100644
Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz
index 830f7a2..7983403 100644
Binary files a/wordfreq/data/twitter_ar.msgpack.gz and b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index d6bbc83..e47744c 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_el.msgpack.gz b/wordfreq/data/twitter_el.msgpack.gz
new file mode 100644
index 0000000..bdf5d40
Binary files /dev/null and b/wordfreq/data/twitter_el.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index d305533..f9e2299 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index fb03dcc..a76fedc 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index 0540be2..fbd4a6b 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index 3295083..0f25751 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index 40b1bd8..fad7127 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index 9826353..7196ff0 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index cab27b3..cb5c2c2 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 0b422c5..e36090b 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index 015db77..7d99d85 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index bd663ae..2749a10 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 395018b..56c2fc9 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq/data/twitter_tr.msgpack.gz b/wordfreq/data/twitter_tr.msgpack.gz
new file mode 100644
index 0000000..7edc781
Binary files /dev/null and b/wordfreq/data/twitter_tr.msgpack.gz differ
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index eb2c631..e33ca1d 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -65,6 +65,15 @@ def simple_tokenize(text):
     return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
+def turkish_tokenize(text):
+    """
+    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
+    in Turkish.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+
+
 def remove_arabic_marks(text):
     """
     Remove decorations from Arabic words:
@@ -90,6 +99,8 @@ def tokenize(text, lang):
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+    - Turkish will use a different case-folding procedure, so that capital
+      I and İ map to ı and i respectively.
     - All other languages will be tokenized using a regex that mostly
       implements the Word Segmentation section of Unicode Annex #29.
       See `simple_tokenize` for details.
@@ -107,6 +118,9 @@ def tokenize(text, lang):
         from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
 
+    if lang == 'tr':
+        return turkish_tokenize(text)
+
     if lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
 
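Why `turkish_tokenize` substitutes characters before case-folding: Python's `str.casefold()` applies only the default Unicode mappings, which turn both capital I and İ into forms of dotted i and never produce dotless ı. A minimal demonstration of the behavior the new function relies on:

```python
import unicodedata

# Default casefolding is wrong for Turkish: dotless ı is never produced.
print('SIKISINCA'.casefold())    # 'sikisinca'

# Substituting the Turkish-specific mappings first preserves the distinction,
# which is what turkish_tokenize does before running TOKEN_RE.
text = unicodedata.normalize('NFC', 'SIKISINCA')
text = text.replace('İ', 'i').replace('I', 'ı')
print(text.casefold())           # 'sıkısınca'
```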
diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 2aedf27..af47613 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -161,3 +161,34 @@ longer represents the words 'don' and 'won', as we assume most of their
 frequency comes from "don't" and "won't". Words that turned into similarly
 common words, however, were left alone: this list doesn't represent "can't"
 because the word was left as "can".
+
+### SUBTLEX
+
+Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in
+wordfreq and derived works without the "academic use" restriction, under the
+following reasonable conditions:
+
+- Wordfreq and code derived from it must credit the SUBTLEX authors.
+  (See the citations in the top-level `README.md` file.)
+- It must remain clear that SUBTLEX is freely available data.
+
+`data/source-lists/subtlex` contains the following files:
+
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
+  extracted, and converted from ISO-8859-1 to UTF-8
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted
+
+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
+[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
+[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
+
diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png
index ef54b21..15635c6 100644
Binary files a/wordfreq_builder/build.png and b/wordfreq_builder/build.png differ
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index b708533..986678c 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,6 +56,12 @@ rule convert_leeds
 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out
 
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
+rule convert_subtlex
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
+
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
 # lines with counts of 100 or more.
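To make the intent of `convert_subtlex` concrete, here is a rough Python equivalent of that shell pipeline. It is a sketch only: the builder actually runs the `cut | tail | ftfy | tr | grep` command above, and the column and row parameters come from the generated ninja file.

```python
import ftfy

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    """Keep the word and frequency columns of a SUBTLEX file, as CSV."""
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:        # skip the header row(s)
                continue
            fields = line.rstrip('\n').split('\t')
            word = ftfy.fix_text(fields[textcol - 1]).strip('" ')
            freq = fields[freqcol - 1]
            if word.endswith('â'):       # unfixable half-mojibake, as in grep -v 'â,'
                continue
            outfile.write('{},{}\n'.format(word, freq))
```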
@@ -71,7 +77,10 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.combine_lists -o $out $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+
+rule merge_counts
+  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
similarity index 66%
rename from wordfreq_builder/wordfreq_builder/cli/combine_lists.py
rename to wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 61d4b1d..5e3de69 100644
--- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,12 +1,13 @@
-from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
-    freq_dicts = []
+    count_dicts = []
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=2))
-    merged = merge_freqs(freq_dicts)
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
+    merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
new file mode 100644
index 0000000..0bbe1c1
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+import argparse
+
+
+def merge_lists(input_names, output_name, cutoff):
+    freq_dicts = []
+    for input_name in input_names:
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+    merged = merge_freqs(freq_dicts)
+    write_wordlist(merged, output_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
+    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    args = parser.parse_args()
+    merge_lists(args.inputs, args.output, args.cutoff)
+
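What the updated `merge` rule does when ninja runs it, sketched with the builder's own functions (the input file names here are hypothetical):

```python
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist

# Per-source frequency files for one language (hypothetical names).
inputs = ['twitter_tr.counts.txt', 'wikipedia_tr.counts.txt',
          'opensubtitles_tr.counts.txt']

# read_freqs normalizes each list so its frequencies add up to 1; the cutoff
# stops reading once values drop below 2, relying on descending sort order.
freq_dicts = [read_freqs(name, cutoff=2) for name in inputs]
write_wordlist(merge_freqs(freq_dicts), 'combined_tr.csv')
```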
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index a80c327..dc61bc6 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -8,20 +8,25 @@ CONFIG = {
     'sources': {
         # A list of language codes (possibly un-standardized) that we'll
         # look up in filenames for these various data sources.
+        #
+        # Consider adding:
+        # 'th' when we get tokenization for it
+        # 'hi' when we stop messing up its tokenization
+        # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
-            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
-            'pt', 'ru',
-            # can be added later: 'th', 'tr'
+            'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru', 'tr'
         ],
         'wikipedia': [
-            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
-            'pt', 'ru'
-            # many more can be added
+            'ar', 'de', 'en', 'el', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru', 'tr'
         ],
         'opensubtitles': [
-            # All languages where the most common word in OpenSubtitles
-            # appears at least 5000 times
-            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
+            # This list includes languages where the most common word in
+            # OpenSubtitles appears at least 5000 times. However, we exclude
+            # German, where SUBTLEX has done better processing of the same data.
+            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'el', 'en', 'es', 'et',
             'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
             'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq',
             'sr', 'sv', 'tr', 'uk', 'zh'
@@ -33,14 +38,19 @@ CONFIG = {
             'en',
             # Using the 2012 data, we could get French, German, Italian,
             # Russian, Spanish, and (Simplified) Chinese.
-        ]
+        ],
+        'subtlex-en': ['en'],
+        'subtlex-other': ['de', 'nl', 'zh'],
     },
+    # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
         'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
+        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
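Note that `subtlex-en` and `subtlex-other` deliberately share one path template, so the merged per-language output for every SUBTLEX source lands in the same directory. A tiny illustration of how these templates expand, mirroring the `wordlist_filename` calls in `ninja.py`:

```python
template = 'generated/subtlex/subtlex_{lang}.{ext}'

print(template.format(lang='en', ext='counts.txt'))
# generated/subtlex/subtlex_en.counts.txt
print(template.format(lang='zh', ext='counts.txt'))
# generated/subtlex/subtlex_zh.counts.txt
```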
""" TMPDIR = data_filename('tmp') @@ -76,6 +77,18 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['opensubtitles'] ) ) + lines.extend( + subtlex_en_deps( + data_filename('source-lists/subtlex'), + CONFIG['sources']['subtlex-en'] + ) + ) + lines.extend( + subtlex_other_deps( + data_filename('source-lists/subtlex'), + CONFIG['sources']['subtlex-other'] + ) + ) lines.extend(combine_lists(all_languages())) print('\n'.join(lines), file=out) @@ -140,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, for language in languages ] add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, - params={'prefix': slice_file}) + params={'prefix': slice_file}, + extra='wordfreq_builder/tokenizers.py') for language in languages: combined_output = wordlist_filename('twitter', language, 'tokens.txt') @@ -188,12 +202,69 @@ def opensubtitles_deps(dirname_in, languages): prefix=dirname_in, lang=language ) reformatted_file = wordlist_filename( - 'opensubtitles', language, 'counts.txt') + 'opensubtitles', language, 'counts.txt' + ) add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines +# Which columns of the SUBTLEX data files do the word and its frequency appear +# in? +SUBTLEX_COLUMN_MAP = { + 'de': (1, 3), + 'el': (2, 3), + 'en': (1, 2), + 'nl': (1, 2), + 'zh': (1, 5) +} + + +def subtlex_en_deps(dirname_in, languages): + lines = [] + assert languages == ['en'] + regions = ['en-US', 'en-GB'] + processed_files = [] + for region in regions: + input_file = '{prefix}/subtlex.{region}.txt'.format( + prefix=dirname_in, region=region + ) + textcol, freqcol = SUBTLEX_COLUMN_MAP['en'] + processed_file = wordlist_filename('subtlex-en', region, 'processed.txt') + processed_files.append(processed_file) + add_dep( + lines, 'convert_subtlex', input_file, processed_file, + params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} + ) + + output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt') + add_dep(lines, 'merge_counts', processed_files, output_file) + + return lines + + +def subtlex_other_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/subtlex.{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + processed_file = wordlist_filename('subtlex-other', language, 'processed.txt') + output_file = wordlist_filename('subtlex-other', language, 'counts.txt') + textcol, freqcol = SUBTLEX_COLUMN_MAP[language] + + # Skip one header line by setting 'startrow' to 2 (because tail is 1-based). + # I hope we don't need to configure this by language anymore. 
+def subtlex_other_deps(dirname_in, languages):
+    lines = []
+    for language in languages:
+        input_file = '{prefix}/subtlex.{lang}.txt'.format(
+            prefix=dirname_in, lang=language
+        )
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Skip one header line by setting 'startrow' to 2 (because tail is 1-based).
+        # I hope we don't need to configure this by language anymore.
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
+        )
+        add_dep(
+            lines, 'merge_counts', processed_file, output_file
+        )
+    return lines
+
+
 def combine_lists(languages):
     lines = []
     for language in languages:
@@ -204,7 +275,8 @@ def combine_lists(languages):
         ]
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'cutoff': 2})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 92d0714..1a75626 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
         '\ufdd0-\ufdef',
         '\N{HANGUL FILLER}',
         '\N{HANGUL CHOSEONG FILLER}',
-        '\N{HANGUL JUNGSEONG FILLER}'
+        '\N{HANGUL JUNGSEONG FILLER}',
+        '<>'
     ] + [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 9da95a3..1933295 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -32,9 +32,40 @@ def count_tokens(filename):
     return counts
 
 
+def read_values(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.
+
+    Only words with a value greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by value
+    in descending order.
+
+    If lang is given, it will apply language-specific preprocessing
+    operations.
+    """
+    values = defaultdict(float)
+    total = 0.
+    with open(filename, encoding='utf-8', newline='') as infile:
+        for key, strval in csv.reader(infile):
+            val = float(strval)
+            key = fix_text(key)
+            if val < cutoff:
+                break
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
+                # Use += so that, if we give the reader concatenated files with
+                # duplicates, it does the right thing
+                values[token] += val
+                total += val
+    return values, total
+
+
 def read_freqs(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
 
     Only words with a frequency greater than or equal to `cutoff` are
     returned.
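The split between counts and frequencies is the point of this refactoring: `read_values` keeps raw values, so that, for example, the two English SUBTLEX regions can be added together with `merge_counts` before anything is normalized. A hedged sketch of that flow (file names hypothetical):

```python
from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist

# Hypothetical processed SUBTLEX files for the two English regions.
us_counts, us_total = read_values('subtlex_en-US.processed.txt', cutoff=0)
gb_counts, gb_total = read_values('subtlex_en-GB.processed.txt', cutoff=0)

# merge_counts adds raw counts term by term; normalization to frequencies
# only happens later, when read_freqs loads the merged file.
write_wordlist(merge_counts([us_counts, gb_counts]), 'subtlex_en.counts.txt')
```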
@@ -44,24 +75,11 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
-    total = 0.
-    with open(filename, encoding='utf-8', newline='') as infile:
-        for key, strval in csv.reader(infile):
-            val = float(strval)
-            if val < cutoff:
-                break
-            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
-            for token in tokens:
-                # Use += so that, if we give the reader concatenated files with
-                # duplicates, it does the right thing
-                raw_counts[fix_text(token)] += val
-                total += val
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
 
-    for word in raw_counts:
-        raw_counts[word] /= total
-
-    return raw_counts
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -96,6 +114,17 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     msgpack.dump(cBpack_data, outfile)
 
 
+def merge_counts(count_dicts):
+    """
+    Merge multiple dictionaries of counts by adding their entries.
+    """
+    merged = defaultdict(int)
+    for count_dict in count_dicts:
+        for term, count in count_dict.items():
+            merged[term] += count
+    return merged
+
+
 def merge_freqs(freq_dicts):
     """
     Merge multiple dictionaries of frequencies, representing each word with