From 20890901514ce950ba1c0cc8fc91d7a77b28e80e Mon Sep 17 00:00:00 2001 From: Andrew Lin Date: Wed, 2 Sep 2015 14:27:15 -0400 Subject: [PATCH 1/2] Remove the no-longer-existent .txt files from the MANIFEST. Former-commit-id: db41bc790271ec2fe6f12f63a0a1d2f7ffed74fc --- MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 012f4ca..4f20a26 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ recursive-include wordfreq/data *.gz include README.md -recursive-include wordfreq/data *.txt From 319c3abaab253ed3b91f6b628b7252eb3162eb8f Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 8 Sep 2015 11:08:21 -0400 Subject: [PATCH 2/2] WIP: fix apostrophe trimming Former-commit-id: e39d345c4bd2cf6fbd33872a9309109774056dca --- .../wordfreq_builder/word_counts.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 9da95a3..f6cbe7d 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -133,3 +133,45 @@ def write_wordlist(freqs, filename, cutoff=1e-8): break if not ('"' in word or ',' in word): writer.writerow([word, str(freq)]) + + +# APOSTROPHE_TRIMMED_PROB represents the probability that this word has had +# "'t" removed from it, based on counts from Twitter, which we know +# accurate token counts for based on our own tokenizer. 
# Keys are the tokens left behind when a tokenizer splits an English
# contraction at its apostrophe ("don't" -> "don" + "t"). Each value is the
# share of that token's frequency that correct_apostrophe_trimming() moves
# to the restored "<token>'t" form; the remainder stays on the bare token.
APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.,
    'wasn': 1.,
    'wouldn': 1.,
    'doesn': 1.,
    'couldn': 1.,
    'ain': 0.99,
    'aren': 1.,
    'shouldn': 1.,
    'haven': 0.96,
    'weren': 1.,
    'hadn': 1.,
    'hasn': 1.,
    'mustn': 1.,
    'needn': 1.,
}


def correct_apostrophe_trimming(freqs):
    """
    Repair an English wordlist that was tokenized with apostrophes as token
    boundaries, by adding "'t" back in about the proportion we expect.

    The wordlist is considered affected only when both 'wouldn' and 'couldn'
    have a frequency above 1e-6. In that case, for every token listed in
    APOSTROPHE_TRIMMED_PROB that is present in `freqs`, the token's frequency
    is split: the fraction given by the table is added to the "<token>'t"
    entry and the rest stays on the bare token.

    `freqs` is a dict mapping words to frequencies. It is modified in place
    and also returned. If the wordlist does not look affected, it is returned
    untouched.
    """
    # Both tokens must clear this frequency before we conclude that
    # contractions were split at the apostrophe.
    detection_threshold = 1e-6
    looks_trimmed = (
        freqs.get('wouldn', 0) > detection_threshold
        and freqs.get('couldn', 0) > detection_threshold
    )
    if not looks_trimmed:
        return freqs

    print("Applying apostrophe trimming")
    for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
        if trim_word not in freqs:
            continue
        freq = freqs[trim_word]
        restored_word = trim_word + "'t"
        freqs[trim_word] = freq * (1 - trim_prob)
        # Accumulate instead of assigning: the contraction may already have
        # its own entry, and overwriting it would discard that frequency.
        freqs[restored_word] = freqs.get(restored_word, 0) + freq * trim_prob
    return freqs