diff --git a/MANIFEST.in b/MANIFEST.in
index 012f4ca..4f20a26 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,2 @@
 recursive-include wordfreq/data *.gz
 include README.md
-recursive-include wordfreq/data *.txt
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index e4eab29..8d6c613 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
             fake_count = round(freq * 1e9)
             print('%s %d' % (word, fake_count), file=outfile)
+
+# APOSTROPHE_TRIMMED_PROB represents the probability that each of these words
+# has had "'t" removed from it, based on counts from Twitter, for which we
+# know accurate token counts from our own tokenizer.
+
+APOSTROPHE_TRIMMED_PROB = {
+    'don': 0.99,
+    'didn': 1.,
+    'can': 0.35,
+    'won': 0.74,
+    'isn': 1.,
+    'wasn': 1.,
+    'wouldn': 1.,
+    'doesn': 1.,
+    'couldn': 1.,
+    'ain': 0.99,
+    'aren': 1.,
+    'shouldn': 1.,
+    'haven': 0.96,
+    'weren': 1.,
+    'hadn': 1.,
+    'hasn': 1.,
+    'mustn': 1.,
+    'needn': 1.,
+}
+
+def correct_apostrophe_trimming(freqs):
+    """
+    If what we got was an English wordlist that has been tokenized with
+    apostrophes as token boundaries, correct the spurious tokens we get by
+    adding 't in about the proportion we expect to see in the wordlist.
+    """
+    if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
+        print("Applying apostrophe trimming")
+        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
+            if trim_word in freqs:
+                freq = freqs[trim_word]
+                freqs[trim_word] = freq * (1 - trim_prob)
+                freqs[trim_word + "'t"] = freq * trim_prob
+    return freqs
+
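
For illustration only (not part of the patch), here is a minimal, self-contained sketch of what the correction does to a toy frequency dict. It mirrors the patched function, but the probability table is cut down to three entries and the frequencies are invented:

    # Hypothetical toy example: the real APOSTROPHE_TRIMMED_PROB table has
    # 18 entries, and the frequencies below are made up for demonstration.
    APOSTROPHE_TRIMMED_PROB = {'don': 0.99, 'wouldn': 1., 'couldn': 1.}

    def correct_apostrophe_trimming(freqs):
        # Only fires when the telltale trimmed tokens are present.
        if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
            for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
                if trim_word in freqs:
                    freq = freqs[trim_word]
                    # Split the observed mass between the bare stem and
                    # its restored "'t" form.
                    freqs[trim_word] = freq * (1 - trim_prob)
                    freqs[trim_word + "'t"] = freq * trim_prob
        return freqs

    toy = {'don': 1e-4, 'wouldn': 2e-5, 'couldn': 2e-5}
    print(correct_apostrophe_trimming(toy))
    # 'don' keeps roughly 1e-06 (1% of its mass), "don't" gets roughly
    # 9.9e-05, and all of the mass of 'wouldn'/'couldn' moves to the
    # "'t" forms.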