mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
language-specific frequency reading; fix 't in English
This commit is contained in:
parent
20f2828d0a
commit
9071defb33
@ -89,7 +89,7 @@ rule count
|
|||||||
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
||||||
|
|
||||||
rule merge
|
rule merge
|
||||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
|
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||||
|
|
||||||
rule merge_counts
|
rule merge_counts
|
||||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||||
|
@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name, cutoff):
|
def merge_lists(input_names, output_name, cutoff, lang):
|
||||||
freq_dicts = []
|
freq_dicts = []
|
||||||
|
|
||||||
|
# Don't use Chinese tokenization while building wordlists, as that would
|
||||||
|
# create a circular dependency.
|
||||||
|
if lang == 'zh':
|
||||||
|
lang = None
|
||||||
|
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
|
freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang))
|
||||||
merged = merge_freqs(freq_dicts)
|
merged = merge_freqs(freq_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
|
|
||||||
@ -14,7 +20,8 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
||||||
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
||||||
|
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, args.cutoff)
|
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||||
|
|
||||||
|
@ -311,10 +311,11 @@ def combine_lists(languages):
|
|||||||
output_file = wordlist_filename('combined', language)
|
output_file = wordlist_filename('combined', language)
|
||||||
add_dep(lines, 'merge', input_files, output_file,
|
add_dep(lines, 'merge', input_files, output_file,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'cutoff': 2})
|
params={'cutoff': 2, 'lang': language})
|
||||||
|
|
||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'combined-dist', language, 'msgpack.gz')
|
'combined-dist', language, 'msgpack.gz'
|
||||||
|
)
|
||||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'lang': language})
|
params={'lang': language})
|
||||||
|
@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
|
|||||||
|
|
||||||
If `cutoff` is greater than 0, the csv file must be sorted by value
|
If `cutoff` is greater than 0, the csv file must be sorted by value
|
||||||
in descending order.
|
in descending order.
|
||||||
|
|
||||||
If lang is given, it will apply language specific preprocessing
|
|
||||||
operations.
|
|
||||||
"""
|
"""
|
||||||
values = defaultdict(float)
|
values = defaultdict(float)
|
||||||
total = 0.
|
total = 0.
|
||||||
@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
|
|||||||
for word in values:
|
for word in values:
|
||||||
values[word] /= total
|
values[word] /= total
|
||||||
|
|
||||||
return values
|
if lang == 'en':
|
||||||
|
return correct_apostrophe_trimming(values)
|
||||||
|
|
||||||
|
|
||||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
||||||
@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
|
|||||||
'needn': 1.,
|
'needn': 1.,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def correct_apostrophe_trimming(freqs):
|
def correct_apostrophe_trimming(freqs):
|
||||||
"""
|
"""
|
||||||
If what we got was an English wordlist that has been tokenized with
|
If what we got was an English wordlist that has been tokenized with
|
||||||
apostrophes as token boundaries, correct the spurious tokens we get by
|
apostrophes as token boundaries, as indicated by the frequencies of the
|
||||||
adding 't in about the proportion we expect to see in the wordlist.
|
words "wouldn" and "couldn", then correct the spurious tokens we get by
|
||||||
|
adding "'t" in about the proportion we expect to see in the wordlist.
|
||||||
|
|
||||||
|
We could also adjust the frequency of "t", but then we would be favoring
|
||||||
|
the token "s" over it, as "'s" leaves behind no indication when it's been
|
||||||
|
removed.
|
||||||
"""
|
"""
|
||||||
if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
|
if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
|
||||||
print("Applying apostrophe trimming")
|
print("Applying apostrophe trimming")
|
||||||
@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
|
|||||||
freqs[trim_word] = freq * (1 - trim_prob)
|
freqs[trim_word] = freq * (1 - trim_prob)
|
||||||
freqs[trim_word + "'t"] = freq * trim_prob
|
freqs[trim_word + "'t"] = freq * trim_prob
|
||||||
return freqs
|
return freqs
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user