diff --git a/MANIFEST.in b/MANIFEST.in
index 012f4ca..4f20a26 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,2 @@
 recursive-include wordfreq/data *.gz
 include README.md
-recursive-include wordfreq/data *.txt
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index e4eab29..8d6c613 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
             fake_count = round(freq * 1e9)
             print('%s %d' % (word, fake_count), file=outfile)
+
+# APOSTROPHE_TRIMMED_PROB represents the probability that each of these words
+# has had "'t" removed from it, based on counts from Twitter, for which we
+# know accurate token counts from our own tokenizer.
+
+APOSTROPHE_TRIMMED_PROB = {
+    'don': 0.99,
+    'didn': 1.,
+    'can': 0.35,
+    'won': 0.74,
+    'isn': 1.,
+    'wasn': 1.,
+    'wouldn': 1.,
+    'doesn': 1.,
+    'couldn': 1.,
+    'ain': 0.99,
+    'aren': 1.,
+    'shouldn': 1.,
+    'haven': 0.96,
+    'weren': 1.,
+    'hadn': 1.,
+    'hasn': 1.,
+    'mustn': 1.,
+    'needn': 1.,
+}
+
+def correct_apostrophe_trimming(freqs):
+    """
+    If what we got was an English wordlist that has been tokenized with
+    apostrophes as token boundaries, correct the spurious tokens we get by
+    adding 't in about the proportion we expect to see in the wordlist.
+    """
+    if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
+        print("Applying apostrophe trimming")
+        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
+            if trim_word in freqs:
+                freq = freqs[trim_word]
+                freqs[trim_word] = freq * (1 - trim_prob)
+                freqs[trim_word + "'t"] = freq * trim_prob
+    return freqs
+
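
For illustration only (not part of the patch), here is a minimal, self-contained sketch of what the correction does to a toy frequency dict. It mirrors the patched function, but the probability table is cut down to three entries and the frequencies are invented:

    # Hypothetical toy example: the real APOSTROPHE_TRIMMED_PROB table has
    # 18 entries, and the frequencies below are made up for demonstration.
    APOSTROPHE_TRIMMED_PROB = {'don': 0.99, 'wouldn': 1., 'couldn': 1.}

    def correct_apostrophe_trimming(freqs):
        # Only fires when the telltale trimmed tokens are present.
        if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
            for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
                if trim_word in freqs:
                    freq = freqs[trim_word]
                    # Split the observed mass between the bare stem and
                    # its restored "'t" form.
                    freqs[trim_word] = freq * (1 - trim_prob)
                    freqs[trim_word + "'t"] = freq * trim_prob
        return freqs

    toy = {'don': 1e-4, 'wouldn': 2e-5, 'couldn': 2e-5}
    print(correct_apostrophe_trimming(toy))
    # 'don' keeps roughly 1e-06 (1% of its mass), "don't" gets roughly
    # 9.9e-05, and all of the mass of 'wouldn'/'couldn' moves to the
    # "'t" forms.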