language-specific frequency reading; fix 't in English

Rob Speer 2015-09-08 12:49:21 -04:00
parent 20f2828d0a
commit 9071defb33
4 changed files with 24 additions and 13 deletions
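The "fix 't" in the title refers to wordlists built by a tokenizer that treats apostrophes as token boundaries: an English contraction like "wouldn't" comes out as the two tokens "wouldn" and "t". A minimal illustration of the problem (this toy tokenizer is made up for the example, not the one wordfreq uses):

    import re

    def naive_tokenize(text):
        # Split on every non-letter character, including apostrophes.
        return [t for t in re.split(r"[^a-z]+", text.lower()) if t]

    print(naive_tokenize("I wouldn't say so"))
    # ['i', 'wouldn', 't', 'say', 'so'] -- the "'t" has been trimmed off

The word_counts.py changes below compensate by moving frequency mass from "wouldn" back to "wouldn't" (and likewise for the other trimmed contractions).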

View File

@@ -89,7 +89,7 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out $in
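When ninja invokes the updated merge rule, $cutoff and $lang are substituted from per-build variables (set via the params change in combine_lists below), so the expanded command looks something like the following; the paths and language code here are hypothetical:

    python -m wordfreq_builder.cli.merge_freqs -o data/combined/en.csv -c 2 -l en data/freqs/en_part1.csv data/freqs/en_part2.csv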

View File: wordfreq_builder/cli/merge_freqs.py

@@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff):
+def merge_lists(input_names, output_name, cutoff, lang):
     freq_dicts = []
+
+    # Don't use Chinese tokenization while building wordlists, as that would
+    # create a circular dependency.
+    if lang == 'zh':
+        lang = None
+
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang))
     merged = merge_freqs(freq_dicts)
     write_wordlist(merged, output_name)
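merge_freqs itself isn't touched by this diff; conceptually it collapses several word-to-frequency dicts into one. A rough sketch of that idea, assuming a simple average over the input lists (not necessarily the project's actual strategy):

    from collections import defaultdict

    def merge_freqs_sketch(freq_dicts):
        # Average each word's frequency across all input wordlists;
        # a word missing from a list contributes 0 for that list.
        merged = defaultdict(float)
        for freqs in freq_dicts:
            for word, freq in freqs.items():
                merged[word] += freq / len(freq_dicts)
        return dict(merged)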
@@ -14,7 +20,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
     parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, args.cutoff)
+    merge_lists(args.inputs, args.output, args.cutoff, args.language)

View File

@@ -311,10 +311,11 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 2, 'lang': language})
 
         output_cBpack = wordlist_filename(
-            'combined-dist', language, 'msgpack.gz')
+            'combined-dist', language, 'msgpack.gz'
+        )
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
                 params={'lang': language})
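With 'lang' added to params, the build statements that add_dep writes out presumably carry a per-build lang variable for the merge rule to expand, along these lines (the file names here are hypothetical):

    build data/combined/en.csv: merge data/freqs/en_part1.csv data/freqs/en_part2.csv | wordfreq_builder/word_counts.py
      cutoff = 2
      lang = en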

View File: wordfreq_builder/word_counts.py

@@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
-
-    If lang is given, it will apply language specific preprocessing
-    operations.
     """
     values = defaultdict(float)
     total = 0.
@@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
     for word in values:
         values[word] /= total
 
-    return values
+    if lang == 'en':
+        return correct_apostrophe_trimming(values)
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
     'needn': 1.,
 }
 
 
 def correct_apostrophe_trimming(freqs):
     """
     If what we got was an English wordlist that has been tokenized with
-    apostrophes as token boundaries, correct the spurious tokens we get by
-    adding 't in about the proportion we expect to see in the wordlist.
+    apostrophes as token boundaries, as indicated by the frequencies of the
+    words "wouldn" and "couldn", then correct the spurious tokens we get by
+    adding "'t" in about the proportion we expect to see in the wordlist.
+
+    We could also adjust the frequency of "t", but then we would be favoring
+    the token "s" over it, as "'s" leaves behind no indication when it's been
+    removed.
     """
     if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
         print("Applying apostrophe trimming")
@@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
                 freqs[trim_word] = freq * (1 - trim_prob)
                 freqs[trim_word + "'t"] = freq * trim_prob
     return freqs
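As a worked example of the loop above (with made-up numbers): suppose the wordlist assigns "needn" a frequency of 2e-6, and the table gives APOSTROPHE_TRIMMED_PROB['needn'] = 1., meaning essentially every occurrence of "needn" came from a trimmed "needn't":

    freqs = {'needn': 2e-6}          # made-up frequency
    trim_prob = 1.0                  # APOSTROPHE_TRIMMED_PROB['needn']

    freq = freqs['needn']
    freqs['needn'] = freq * (1 - trim_prob)    # -> 0.0
    freqs["needn't"] = freq * trim_prob        # -> 2e-6, mass restored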