Pass the language code through the 'merge' build step so that read_freqs can
apply English apostrophe-trimming correction while merging wordlists.

Reviewer notes (text before the first "diff --git" header is ignored by
git apply / patch):
  * cli/merge_freqs.py: the description of -l/--language is now passed as
    help=...; passed positionally, argparse treats it as a third option
    string and raises ValueError because it does not start with '-'.
  * word_counts.py: read_freqs keeps its unconditional `return values`;
    without it, every language other than 'en' (including lang=None, which
    merge_lists uses for 'zh') got None back instead of the frequency dict.
  * Hunk headers for word_counts.py were recomputed for the extra lines.
  * Context-line indentation was reconstructed from a whitespace-collapsed
    copy of this patch; verify it against the target tree before applying.

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 8084542..2a4fa0f 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -89,7 +89,7 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out $in
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index 0bbe1c1..8a914b9 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff):
+def merge_lists(input_names, output_name, cutoff, lang):
     freq_dicts = []
+
+    # Don't use Chinese tokenization while building wordlists, as that would
+    # create a circular dependency.
+    if lang == 'zh':
+        lang = None
+
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang))
     merged = merge_freqs(freq_dicts)
     write_wordlist(merged, output_name)
 
@@ -14,7 +20,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
     parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, args.cutoff)
+    merge_lists(args.inputs, args.output, args.cutoff, args.language)
 
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 038888d..5f9e59b 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -311,10 +311,11 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 2, 'lang': language})
 
         output_cBpack = wordlist_filename(
-            'combined-dist', language, 'msgpack.gz')
+            'combined-dist', language, 'msgpack.gz'
+        )
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
                 params={'lang': language})
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 8d6c613..4bc0deb 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
 
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
-
-    If lang is given, it will apply language specific preprocessing
-    operations.
     """
     values = defaultdict(float)
     total = 0.
@@ -80,7 +77,10 @@ def read_freqs(filename, cutoff=0, lang=None):
     for word in values:
         values[word] /= total
 
+    if lang == 'en':
+        values = correct_apostrophe_trimming(values)
+
     return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -205,11 +205,17 @@ APOSTROPHE_TRIMMED_PROB = {
     'needn': 1.,
 }
 
+
 def correct_apostrophe_trimming(freqs):
     """
     If what we got was an English wordlist that has been tokenized with
-    apostrophes as token boundaries, correct the spurious tokens we get by
-    adding 't in about the proportion we expect to see in the wordlist.
+    apostrophes as token boundaries, as indicated by the frequencies of the
+    words "wouldn" and "couldn", then correct the spurious tokens we get by
+    adding "'t" in about the proportion we expect to see in the wordlist.
+
+    We could also adjust the frequency of "t", but then we would be favoring
+    the token "s" over it, as "'s" leaves behind no indication when it's been
+    removed.
     """
     if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
         print("Applying apostrophe trimming")
@@ -219,4 +225,3 @@ def correct_apostrophe_trimming(freqs):
                 freqs[trim_word] = freq * (1 - trim_prob)
                 freqs[trim_word + "'t"] = freq * trim_prob
     return freqs
-