Merge branch 'apostrophe-fix' into chinese-scripts

Conflicts:
	wordfreq_builder/wordfreq_builder/word_counts.py
Rob Speer 2015-09-08 12:29:00 -04:00
commit 20f2828d0a
2 changed files with 41 additions and 1 deletion

MANIFEST.in

@@ -1,3 +1,2 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt

wordfreq_builder/wordfreq_builder/word_counts.py

@@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
            fake_count = round(freq * 1e9)
            print('%s %d' % (word, fake_count), file=outfile)

# APOSTROPHE_TRIMMED_PROB represents the probability that this word has had
# "'t" removed from it, based on counts from Twitter, for which we have
# accurate token counts from our own tokenizer.
APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.,
    'wasn': 1.,
    'wouldn': 1.,
    'doesn': 1.,
    'couldn': 1.,
    'ain': 0.99,
    'aren': 1.,
    'shouldn': 1.,
    'haven': 0.96,
    'weren': 1.,
    'hadn': 1.,
    'hasn': 1.,
    'mustn': 1.,
    'needn': 1.,
}

def correct_apostrophe_trimming(freqs):
    """
    If what we got was an English wordlist that has been tokenized with
    apostrophes as token boundaries, correct the spurious tokens we get by
    adding 't in about the proportion we expect to see in the wordlist.
    """
    if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
        print("Applying apostrophe trimming")
        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
            if trim_word in freqs:
                freq = freqs[trim_word]
                freqs[trim_word] = freq * (1 - trim_prob)
                freqs[trim_word + "'t"] = freq * trim_prob
    return freqs
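
As a rough illustration of what the new correction does, here is a small usage sketch. The frequency values below are invented for the example; only correct_apostrophe_trimming and APOSTROPHE_TRIMMED_PROB come from the diff above. The trimming probabilities themselves were presumably estimated from Twitter counts along the lines of count("don't") / (count("don") + count("don't")), though that derivation is not part of this diff.

# Hypothetical example (values invented): a wordlist whose tokenizer split on
# apostrophes, so most occurrences of "don't" were counted under the fragment "don".
freqs = {
    'don': 1.0e-3,
    'wouldn': 2.0e-4,
    'couldn': 2.0e-4,
    'the': 5.0e-2,
}

corrected = correct_apostrophe_trimming(freqs)

# 'wouldn' and 'couldn' are frequent enough (> 1e-6) to trigger the correction.
# With APOSTROPHE_TRIMMED_PROB['don'] == 0.99, the mass of 'don' is split:
#   corrected['don']   == 1.0e-3 * 0.01   -> the rare standalone token "don"
#   corrected["don't"] == 1.0e-3 * 0.99   -> the restored contraction
# 'wouldn' (probability 1.) moves entirely to "wouldn't"; 'the' is unchanged.
print(corrected["don't"], corrected["wouldn't"], corrected['the'])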