From 20890901514ce950ba1c0cc8fc91d7a77b28e80e Mon Sep 17 00:00:00 2001
From: Andrew Lin <alin@luminoso.com>
Date: Wed, 2 Sep 2015 14:27:15 -0400
Subject: [PATCH 1/2] Remove the no-longer-existent .txt files from the
 MANIFEST.

Former-commit-id: db41bc790271ec2fe6f12f63a0a1d2f7ffed74fc
---
 MANIFEST.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 012f4ca..4f20a26 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,2 @@
 recursive-include wordfreq/data *.gz
 include README.md
-recursive-include wordfreq/data *.txt

From 319c3abaab253ed3b91f6b628b7252eb3162eb8f Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Tue, 8 Sep 2015 11:08:21 -0400
Subject: [PATCH 2/2] WIP: fix apostrophe trimming

Former-commit-id: e39d345c4bd2cf6fbd33872a9309109774056dca
---
 .../wordfreq_builder/word_counts.py           | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 9da95a3..f6cbe7d 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -133,3 +133,45 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
                 break
             if not ('"' in word or ',' in word):
                 writer.writerow([word, str(freq)])
+
+
+# APOSTROPHE_TRIMMED_PROB maps a token to the probability that it has had
+# "'t" removed from it. The values are based on counts from Twitter, for
+# which our own tokenizer gives us accurate token counts.
+
+APOSTROPHE_TRIMMED_PROB = {
+    'don': 0.99,
+    'didn': 1.0,
+    'can': 0.35,
+    'won': 0.74,
+    'isn': 1.0,
+    'wasn': 1.0,
+    'wouldn': 1.0,
+    'doesn': 1.0,
+    'couldn': 1.0,
+    'ain': 0.99,
+    'aren': 1.0,
+    'shouldn': 1.0,
+    'haven': 0.96,
+    'weren': 1.0,
+    'hadn': 1.0,
+    'hasn': 1.0,
+    'mustn': 1.0,
+    'needn': 1.0,
+}
+
+def correct_apostrophe_trimming(freqs):
+    """
+    If `freqs` looks like an English wordlist that was tokenized with
+    apostrophes as token boundaries, move the expected share of each trimmed
+    word's frequency back onto its "'t" form. Updates and returns `freqs`.
+    """
+    if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
+        print("Applying apostrophe trimming")
+        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
+            if trim_word in freqs:
+                freq, full = freqs[trim_word], trim_word + "'t"
+                freqs[trim_word] = freq * (1 - trim_prob)
+                freqs[full] = freqs.get(full, 0) + freq * trim_prob
+    return freqs
+