Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
commit 20f2828d0a

Merge branch 'apostrophe-fix' into chinese-scripts

Conflicts:
    wordfreq_builder/wordfreq_builder/word_counts.py
@@ -1,3 +1,2 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt
@@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
            fake_count = round(freq * 1e9)
            print('%s %d' % (word, fake_count), file=outfile)


# APOSTROPHE_TRIMMED_PROB represents the probability that this word has had
# "'t" removed from it, based on counts from Twitter, for which we have
# accurate token counts from our own tokenizer.

APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.,
    'wasn': 1.,
    'wouldn': 1.,
    'doesn': 1.,
    'couldn': 1.,
    'ain': 0.99,
    'aren': 1.,
    'shouldn': 1.,
    'haven': 0.96,
    'weren': 1.,
    'hadn': 1.,
    'hasn': 1.,
    'mustn': 1.,
    'needn': 1.,
}


def correct_apostrophe_trimming(freqs):
    """
    If what we got was an English wordlist that has been tokenized with
    apostrophes as token boundaries, correct the spurious tokens we get by
    adding 't in about the proportion we expect to see in the wordlist.
    """
    if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
        print("Applying apostrophe trimming")
        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
            if trim_word in freqs:
                freq = freqs[trim_word]
                freqs[trim_word] = freq * (1 - trim_prob)
                freqs[trim_word + "'t"] = freq * trim_prob
    return freqs
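As a quick illustration (not part of the commit; the frequency values below are made up), correct_apostrophe_trimming redistributes each trimmed stem's frequency between the stem and its restored "'t" form:

# Hypothetical frequencies from a Twitter-derived English wordlist in which
# "don't" was split at the apostrophe and the "'t" token was discarded.
freqs = {'don': 1.0e-3, 'wouldn': 2.0e-5, 'couldn': 1.5e-5}

freqs = correct_apostrophe_trimming(freqs)

# 'don' has trim_prob 0.99, so 99% of its mass moves to "don't":
# freqs['don'] is now about 1.0e-5 and freqs["don't"] about 9.9e-4.
# 'wouldn' and 'couldn' have trim_prob 1., so all of their mass moves
# to "wouldn't" and "couldn't".

The guard on 'wouldn' and 'couldn' works as a detector: those stems essentially never occur on their own in English, so seeing them with non-negligible frequency implies the wordlist was tokenized with apostrophes as boundaries.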