mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
language-specific frequency reading; fix 't in English
This commit is contained in:
parent
20f2828d0a
commit
9071defb33
@ -89,7 +89,7 @@ rule count
|
|||||||
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
||||||
|
|
||||||
rule merge
|
rule merge
|
||||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
|
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||||
|
|
||||||
rule merge_counts
|
rule merge_counts
|
||||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||||
|
@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name, cutoff):
|
def merge_lists(input_names, output_name, cutoff, lang):
|
||||||
freq_dicts = []
|
freq_dicts = []
|
||||||
|
|
||||||
|
# Don't use Chinese tokenization while building wordlists, as that would
|
||||||
|
# create a circular dependency.
|
||||||
|
if lang == 'zh':
|
||||||
|
lang = None
|
||||||
|
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
|
freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang))
|
||||||
merged = merge_freqs(freq_dicts)
|
merged = merge_freqs(freq_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
|
|
||||||
@ -14,7 +20,8 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
||||||
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
||||||
|
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, args.cutoff)
|
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||||
|
|
||||||
|
@ -311,10 +311,11 @@ def combine_lists(languages):
|
|||||||
output_file = wordlist_filename('combined', language)
|
output_file = wordlist_filename('combined', language)
|
||||||
add_dep(lines, 'merge', input_files, output_file,
|
add_dep(lines, 'merge', input_files, output_file,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'cutoff': 2})
|
params={'cutoff': 2, 'lang': language})
|
||||||
|
|
||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'combined-dist', language, 'msgpack.gz')
|
'combined-dist', language, 'msgpack.gz'
|
||||||
|
)
|
||||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py',
|
extra='wordfreq_builder/word_counts.py',
|
||||||
params={'lang': language})
|
params={'lang': language})
|
||||||
|
@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
|
|||||||
|
|
||||||
If `cutoff` is greater than 0, the csv file must be sorted by value
|
If `cutoff` is greater than 0, the csv file must be sorted by value
|
||||||
in descending order.
|
in descending order.
|
||||||
|
|
||||||
If lang is given, it will apply language specific preprocessing
|
|
||||||
operations.
|
|
||||||
"""
|
"""
|
||||||
values = defaultdict(float)
|
values = defaultdict(float)
|
||||||
total = 0.
|
total = 0.
|
||||||
@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
|
|||||||
for word in values:
|
for word in values:
|
||||||
values[word] /= total
|
values[word] /= total
|
||||||
|
|
||||||
return values
|
if lang == 'en':
|
||||||
|
return correct_apostrophe_trimming(values)
|
||||||
|
|
||||||
|
|
||||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
||||||
@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
|
|||||||
'needn': 1.,
|
'needn': 1.,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def correct_apostrophe_trimming(freqs):
|
def correct_apostrophe_trimming(freqs):
|
||||||
"""
|
"""
|
||||||
If what we got was an English wordlist that has been tokenized with
|
If what we got was an English wordlist that has been tokenized with
|
||||||
apostrophes as token boundaries, correct the spurious tokens we get by
|
apostrophes as token boundaries, as indicated by the frequencies of the
|
||||||
adding 't in about the proportion we expect to see in the wordlist.
|
words "wouldn" and "couldn", then correct the spurious tokens we get by
|
||||||
|
adding "'t" in about the proportion we expect to see in the wordlist.
|
||||||
|
|
||||||
|
We could also adjust the frequency of "t", but then we would be favoring
|
||||||
|
the token "s" over it, as "'s" leaves behind no indication when it's been
|
||||||
|
removed.
|
||||||
"""
|
"""
|
||||||
if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
|
if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
|
||||||
print("Applying apostrophe trimming")
|
print("Applying apostrophe trimming")
|
||||||
@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
|
|||||||
freqs[trim_word] = freq * (1 - trim_prob)
|
freqs[trim_word] = freq * (1 - trim_prob)
|
||||||
freqs[trim_word + "'t"] = freq * trim_prob
|
freqs[trim_word + "'t"] = freq * trim_prob
|
||||||
return freqs
|
return freqs
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user