mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
language-specific frequency reading; fix 't in English
This commit is contained in:
parent
20f2828d0a
commit
9071defb33
@ -89,7 +89,7 @@ rule count
|
||||
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
||||
|
||||
rule merge
|
||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
|
||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||
|
||||
rule merge_counts
|
||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||
|
@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
|
||||
import argparse
|
||||
|
||||
|
||||
def merge_lists(input_names, output_name, cutoff, lang=None):
    """
    Merge several frequency lists into a single wordlist file.

    Each file named in `input_names` is read with `read_freqs`, passing
    along `cutoff` and `lang`. The resulting frequency dictionaries are
    combined with `merge_freqs`, and the merged result is written to
    `output_name` with `write_wordlist`.

    `lang` is optional so that callers using the older three-argument form
    keep working.
    """
    # Don't use Chinese tokenization while building wordlists, as that would
    # create a circular dependency.
    if lang == 'zh':
        lang = None

    freq_dicts = [
        read_freqs(input_name, cutoff=cutoff, lang=lang)
        for input_name in input_names
    ]
    merged = merge_freqs(freq_dicts)
    write_wordlist(merged, output_name)
|
||||
|
||||
@ -14,7 +20,8 @@ if __name__ == '__main__':
|
||||
# Command-line entry point: parse the output filename, count cutoff,
# language code and input filenames, then merge the inputs into one list.
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
# The description must be passed as `help=`: a bare third string is taken by
# argparse as another option string and rejected for not starting with '-'.
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
args = parser.parse_args()
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||
|
||||
|
@ -311,10 +311,11 @@ def combine_lists(languages):
|
||||
output_file = wordlist_filename('combined', language)
|
||||
add_dep(lines, 'merge', input_files, output_file,
|
||||
extra='wordfreq_builder/word_counts.py',
|
||||
params={'cutoff': 2})
|
||||
params={'cutoff': 2, 'lang': language})
|
||||
|
||||
output_cBpack = wordlist_filename(
|
||||
'combined-dist', language, 'msgpack.gz')
|
||||
'combined-dist', language, 'msgpack.gz'
|
||||
)
|
||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||
extra='wordfreq_builder/word_counts.py',
|
||||
params={'lang': language})
|
||||
|
@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
|
||||
|
||||
If `cutoff` is greater than 0, the csv file must be sorted by value
|
||||
in descending order.
|
||||
|
||||
If lang is given, it will apply language specific preprocessing
|
||||
operations.
|
||||
"""
|
||||
values = defaultdict(float)
|
||||
total = 0.
|
||||
@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
|
||||
for word in values:
|
||||
values[word] /= total
|
||||
|
||||
return values
|
||||
if lang == 'en':
|
||||
return correct_apostrophe_trimming(values)
|
||||
|
||||
|
||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
||||
@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
|
||||
'needn': 1.,
|
||||
}
|
||||
|
||||
|
||||
def correct_apostrophe_trimming(freqs):
|
||||
"""
|
||||
If what we got was an English wordlist that has been tokenized with
|
||||
apostrophes as token boundaries, correct the spurious tokens we get by
|
||||
adding 't in about the proportion we expect to see in the wordlist.
|
||||
apostrophes as token boundaries, as indicated by the frequencies of the
|
||||
words "wouldn" and "couldn", then correct the spurious tokens we get by
|
||||
adding "'t" in about the proportion we expect to see in the wordlist.
|
||||
|
||||
We could also adjust the frequency of "t", but then we would be favoring
|
||||
the token "s" over it, as "'s" leaves behind no indication when it's been
|
||||
removed.
|
||||
"""
|
||||
if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
|
||||
print("Applying apostrophe trimming")
|
||||
@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
|
||||
freqs[trim_word] = freq * (1 - trim_prob)
|
||||
freqs[trim_word + "'t"] = freq * trim_prob
|
||||
return freqs
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user