Merge branch 'apostrophe-fix' into chinese-scripts

Conflicts:
	wordfreq_builder/wordfreq_builder/word_counts.py
This commit is contained in:
Rob Speer 2015-09-08 12:29:00 -04:00
commit 20f2828d0a
2 changed files with 41 additions and 1 deletions

View File

@ -1,3 +1,2 @@
recursive-include wordfreq/data *.gz recursive-include wordfreq/data *.gz
include README.md include README.md
recursive-include wordfreq/data *.txt

View File

@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
fake_count = round(freq * 1e9) fake_count = round(freq * 1e9)
print('%s %d' % (word, fake_count), file=outfile) print('%s %d' % (word, fake_count), file=outfile)
# APOSTROPHE_TRIMMED_PROB maps a word stem to the probability that an
# occurrence of that stem is really a contraction whose trailing "'t" was cut
# off by a tokenizer that splits on apostrophes (e.g. "don" <- "don't").
#
# The values are based on counts from Twitter, a source for which we have
# accurate token counts from our own tokenizer.
APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.0,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.0,
    'wasn': 1.0,
    'wouldn': 1.0,
    'doesn': 1.0,
    'couldn': 1.0,
    'ain': 0.99,
    'aren': 1.0,
    'shouldn': 1.0,
    'haven': 0.96,
    'weren': 1.0,
    'hadn': 1.0,
    'hasn': 1.0,
    'mustn': 1.0,
    'needn': 1.0,
}
def correct_apostrophe_trimming(freqs):
    """
    Move frequency from apostrophe-trimmed stems back onto their "'t" forms.

    If what we got was an English wordlist that has been tokenized with
    apostrophes as token boundaries -- detected by both "wouldn" and "couldn"
    having a frequency above 1e-6 -- correct the spurious tokens by giving
    each stem listed in APOSTROPHE_TRIMMED_PROB's expected share of frequency
    to the entry for the stem plus "'t".

    `freqs` is a dict mapping words to their frequencies. It is modified in
    place and also returned. A wordlist that does not look apostrophe-trimmed
    is returned unchanged.
    """
    looks_trimmed = (
        freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6
    )
    if not looks_trimmed:
        return freqs

    print("Applying apostrophe trimming")
    for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
        if trim_word not in freqs:
            continue
        freq = freqs[trim_word]
        # The stem keeps only the share that was never a contraction.
        freqs[trim_word] = freq * (1 - trim_prob)
        # Add to whatever frequency the contraction already has rather than
        # overwriting it, so a list containing both forms loses no mass.
        contraction = trim_word + "'t"
        freqs[contraction] = freqs.get(contraction, 0) + freq * trim_prob
    return freqs