diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 032d80a..a1dc1c7 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-    command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+    command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out

 rule cat
     command = cat $in > $out
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index ab74ec8..1c58154 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')


 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         print(rulesfile.read(), file=out)

     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)


-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append("  tmp = {}".format(TMPDIR))
     return lines


-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
-                                      combined_prefix, slices):
+def twitter_deps(input_filename, slice_prefix,
+                 combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
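
Note (not part of the patch): a minimal sketch of the build statement that the new loop body in wikipedia_deps() emits into the generated deps file. The base paths, the 'fr' language code, and the token-file name are hypothetical stand-ins for whatever data_filename() and the tokenize step actually produce in the repo.

    import pathlib

    # Hypothetical stand-ins; the real values come from data_filename() and CONFIG.
    TMPDIR = 'data/tmp'
    path_out = pathlib.Path('data/generated/wikipedia')
    language = 'fr'

    token_file = path_out / 'wikipedia_{}.txt'.format(language)          # assumed name of the tokenized output
    output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)  # as added in the patch

    lines = []
    lines.append("build {outs}: count {ins}".format(outs=output_file, ins=token_file))
    lines.append("  tmp = {}".format(TMPDIR))  # ninja scopes this $tmp to the build statement above
    print('\n'.join(lines))

    # Prints:
    # build data/generated/wikipedia/wikipedia_fr.counts.txt: count data/generated/wikipedia/wikipedia_fr.txt
    #   tmp = data/tmp

When ninja runs that statement, the count rule expands to the new command, so 'LANG=C sort' creates both output directories and writes its temporary files under $tmp rather than the system default temp directory.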