Merge branch 'apostrophe-fix' into chinese-scripts

Conflicts: wordfreq_builder/wordfreq_builder/word_counts.py
2024-12-23 17:31:41 +00:00 · 2015-09-08 12:29:00 -04:00 · 2015-09-08 12:29:00 -04:00 · 20f2828d0a
commit 20f2828d0a
parent d576e3294b e39d345c4b
2 changed files with 41 additions and 1 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,2 @@
 recursive-include wordfreq/data *.gz
 include README.md
 recursive-include wordfreq/data *.txt
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
                fake_count = round(freq * 1e9)
                print('%s %d' % (word, fake_count), file=outfile)
 # For each token left behind when apostrophes are treated as token
 # boundaries, APOSTROPHE_TRIMMED_PROB gives the probability that the token
 # originally ended in "'t" (for example, "don" standing in for "don't").
 # The values are estimated from Twitter counts, a source for which our own
 # tokenizer gives us accurate token counts.
 APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.0,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.0,
    'wasn': 1.0,
    'wouldn': 1.0,
    'doesn': 1.0,
    'couldn': 1.0,
    'ain': 0.99,
    'aren': 1.0,
    'shouldn': 1.0,
    'haven': 0.96,
    'weren': 1.0,
    'hadn': 1.0,
    'hasn': 1.0,
    'mustn': 1.0,
    'needn': 1.0,
 }
 def correct_apostrophe_trimming(freqs):
    """
    Restore "'t" contractions in an apostrophe-split English wordlist.

    If what we got was an English wordlist that has been tokenized with
    apostrophes as token boundaries, spurious tokens such as "wouldn" and
    "couldn" are left behind. When both of those tokens have a frequency
    above 1e-6, every word of APOSTROPHE_TRIMMED_PROB that is present in
    `freqs` has its frequency split: the share given by its trimming
    probability is assigned to the word with "'t" appended (overwriting
    any entry already stored under that key), and the remainder stays on
    the bare word.

    `freqs` is a mapping from word to frequency. It is modified in place
    and also returned. When the wordlist does not look apostrophe-trimmed
    it is returned unchanged.
    """
    looks_trimmed = (freqs.get('wouldn', 0) > 1e-6 and
                     freqs.get('couldn', 0) > 1e-6)
    if looks_trimmed:
        print("Applying apostrophe trimming")
        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
            if trim_word in freqs:
                freq = freqs[trim_word]
                freqs[trim_word] = freq * (1 - trim_prob)
                freqs[trim_word + "'t"] = freq * trim_prob
    # Return on every path: a wordlist that needs no correction must come
    # back as-is rather than as None.
    return freqs