diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 8084542..2a4fa0f 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -89,7 +89,7 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out $in
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index 0bbe1c1..8a914b9 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff):
-    freq_dicts = []
-    for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+def merge_lists(input_names, output_name, cutoff, lang):
+    """Merge the frequency lists in `input_names` into `output_name`."""
+    # Don't use Chinese tokenization while building wordlists, as that would
+    # create a circular dependency.
+    reader_lang = None if lang == 'zh' else lang
+
+    freq_dicts = [
+        read_freqs(input_name, cutoff=cutoff, lang=reader_lang)
+        for input_name in input_names
+    ]
     merged = merge_freqs(freq_dicts)
     write_wordlist(merged, output_name)
 
@@ -14,7 +20,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
     parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, args.cutoff)
+    merge_lists(args.inputs, args.output, args.cutoff, args.language)
 
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 038888d..5f9e59b 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -311,10 +311,11 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 2, 'lang': language})
 
         output_cBpack = wordlist_filename(
-            'combined-dist', language, 'msgpack.gz')
+            'combined-dist', language, 'msgpack.gz'
+        )
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
                 params={'lang': language})
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 8d6c613..4bc0deb 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
 
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
-
-    If lang is given, it will apply language specific preprocessing
-    operations.
     """
     values = defaultdict(float)
     total = 0.
@@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
     for word in values:
         values[word] /= total
-
-    return values
+    if lang == 'en':
+        correct_apostrophe_trimming(values)  # adjusts `values` in place
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
     'needn': 1.,
 }
 
+
 def correct_apostrophe_trimming(freqs):
     """
     If what we got was an English wordlist that has been tokenized with
-    apostrophes as token boundaries, correct the spurious tokens we get by
-    adding 't in about the proportion we expect to see in the wordlist.
+    apostrophes as token boundaries, as indicated by the frequencies of the
+    words "wouldn" and "couldn", then correct the spurious tokens we get by
+    adding "'t" in about the proportion we expect to see in the wordlist.
+
+    We could also adjust the frequency of "t", but then we would be favoring
+    the token "s" over it, as "'s" leaves behind no indication when it's been
+    removed.
     """
     if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
         print("Applying apostrophe trimming")
@@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
                 freqs[trim_word] = freq * (1 - trim_prob)
                 freqs[trim_word + "'t"] = freq * trim_prob
         return freqs
-