mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)
parent: 42a7d5a439
commit: 319c3abaab
@@ -133,3 +133,45 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
                 break
             if not ('"' in word or ',' in word):
                 writer.writerow([word, str(freq)])
+
+
+# APOSTROPHE_TRIMMED_PROB represents the probability that this word has had
+# "'t" removed from it, based on counts from Twitter, for which we have
+# accurate token counts from our own tokenizer.
+APOSTROPHE_TRIMMED_PROB = {
+    'don': 0.99,
+    'didn': 1.,
+    'can': 0.35,
+    'won': 0.74,
+    'isn': 1.,
+    'wasn': 1.,
+    'wouldn': 1.,
+    'doesn': 1.,
+    'couldn': 1.,
+    'ain': 0.99,
+    'aren': 1.,
+    'shouldn': 1.,
+    'haven': 0.96,
+    'weren': 1.,
+    'hadn': 1.,
+    'hasn': 1.,
+    'mustn': 1.,
+    'needn': 1.,
+}
+
+
+def correct_apostrophe_trimming(freqs):
+    """
+    If what we got was an English wordlist that has been tokenized with
+    apostrophes as token boundaries, correct the spurious tokens we get by
+    adding 't in about the proportion we expect to see in the wordlist.
+    """
+    if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
+        print("Applying apostrophe trimming")
+        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
+            if trim_word in freqs:
+                freq = freqs[trim_word]
+                freqs[trim_word] = freq * (1 - trim_prob)
+                freqs[trim_word + "'t"] = freq * trim_prob
+    return freqs
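For illustration only (not part of the commit), here is a minimal standalone sketch of what the correction does to a wordlist. The toy frequency values below are invented, and the dictionary is cut down to three entries; the logic mirrors correct_apostrophe_trimming from the diff above.

    # Standalone sketch of the apostrophe-trimming correction.
    # The frequencies here are made-up example numbers, not real data.
    APOSTROPHE_TRIMMED_PROB = {'don': 0.99, 'wouldn': 1., 'couldn': 1.}

    def correct_apostrophe_trimming(freqs):
        # Only apply the fix when the telltale trimmed tokens ('wouldn',
        # 'couldn') are frequent enough to suggest that apostrophes were
        # treated as token boundaries when the wordlist was built.
        if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
            for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
                if trim_word in freqs:
                    freq = freqs[trim_word]
                    freqs[trim_word] = freq * (1 - trim_prob)
                    freqs[trim_word + "'t"] = freq * trim_prob
        return freqs

    freqs = {'don': 4e-4, 'wouldn': 2e-5, 'couldn': 1.5e-5, 'word': 1e-3}
    print(correct_apostrophe_trimming(freqs))
    # 'wouldn' and 'couldn' move entirely to "wouldn't" and "couldn't";
    # 'don' keeps 1% of its mass and "don't" receives the other 99%.

The frequency mass is redistributed rather than added, so the total probability in the wordlist stays the same after the correction.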