mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-25 02:05:24 +00:00
Merge branch 'apostrophe-fix' into chinese-scripts
Conflicts:
wordfreq_builder/wordfreq_builder/word_counts.py
Former-commit-id: 20f2828d0a
This commit is contained in:
commit
30237cf73d
@ -1,3 +1,2 @@
|
|||||||
recursive-include wordfreq/data *.gz
|
recursive-include wordfreq/data *.gz
|
||||||
include README.md
|
include README.md
|
||||||
recursive-include wordfreq/data *.txt
|
|
||||||
|
@ -179,3 +179,44 @@ def write_jieba(freqs, filename):
|
|||||||
fake_count = round(freq * 1e9)
|
fake_count = round(freq * 1e9)
|
||||||
print('%s %d' % (word, fake_count), file=outfile)
|
print('%s %d' % (word, fake_count), file=outfile)
|
||||||
|
|
||||||
|
|
||||||
|
# APOSTROPHE_TRIMMED_PROB maps a token to the probability that an occurrence
# of it is really a longer word that has had "'t" removed from it (for
# example, "don" left over from "don't"). The values are based on counts from
# Twitter, for which we know accurate token counts from our own tokenizer.
APOSTROPHE_TRIMMED_PROB = {
    'don': 0.99,
    'didn': 1.0,
    'can': 0.35,
    'won': 0.74,
    'isn': 1.0,
    'wasn': 1.0,
    'wouldn': 1.0,
    'doesn': 1.0,
    'couldn': 1.0,
    'ain': 0.99,
    'aren': 1.0,
    'shouldn': 1.0,
    'haven': 0.96,
    'weren': 1.0,
    'hadn': 1.0,
    'hasn': 1.0,
    'mustn': 1.0,
    'needn': 1.0,
}


def correct_apostrophe_trimming(freqs):
    """
    Restore "'t" contractions in a wordlist that was split on apostrophes.

    If what we got was an English wordlist that has been tokenized with
    apostrophes as token boundaries, correct the spurious tokens we get by
    adding "'t" in about the proportion we expect to see in the wordlist.

    Such a wordlist is detected by the tokens "wouldn" and "couldn" both
    having a frequency above 1e-6. When that is the case, the frequency of
    every token listed in APOSTROPHE_TRIMMED_PROB is split: the fraction
    given by the table is moved to the token with "'t" appended, and the
    remainder stays on the bare token (so "can" keeps 65% of its frequency,
    while "didn" drops to 0).

    `freqs` is a dictionary mapping words to frequencies. It is modified in
    place, and the same dictionary is returned. If the wordlist does not
    look apostrophe-trimmed, it is returned unchanged.
    """
    if freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6:
        print("Applying apostrophe trimming")
        for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items():
            if trim_word in freqs:
                freq = freqs[trim_word]
                restored_word = trim_word + "'t"
                freqs[trim_word] = freq * (1 - trim_prob)
                # Add to any frequency the full contraction already has,
                # instead of overwriting it, so that no probability mass is
                # thrown away when both forms appear in the wordlist.
                freqs[restored_word] = (
                    freqs.get(restored_word, 0) + freq * trim_prob
                )
    return freqs
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user