language-specific frequency reading; fix 't in English

Rob Speer 2015-09-08 12:49:21 -04:00
parent 20f2828d0a
commit 9071defb33
4 changed files with 24 additions and 13 deletions
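The "fix 't" in the title refers to wordlists built by a tokenizer that treats apostrophes as token boundaries: an English contraction like "wouldn't" comes out as the two tokens "wouldn" and "t". A minimal illustration of the problem (this toy tokenizer is made up for the example, not the one wordfreq uses):

    import re

    def naive_tokenize(text):
        # Split on every non-letter character, including apostrophes.
        return [t for t in re.split(r"[^a-z]+", text.lower()) if t]

    print(naive_tokenize("I wouldn't say so"))
    # ['i', 'wouldn', 't', 'say', 'so'] -- the "'t" has been trimmed off

The word_counts.py changes below compensate by moving frequency mass from "wouldn" back to "wouldn't" (and likewise for the other trimmed contractions).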

View File

@@ -89,7 +89,7 @@ rule count
   command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+  command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out $in
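When ninja invokes the updated merge rule, $cutoff and $lang are substituted from per-build variables (set via the params change in combine_lists below), so the expanded command looks something like the following; the paths and language code here are hypothetical:

    python -m wordfreq_builder.cli.merge_freqs -o data/combined/en.csv -c 2 -l en data/freqs/en_part1.csv data/freqs/en_part2.csv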

View File: wordfreq_builder/cli/merge_freqs.py

@@ -2,10 +2,16 @@ from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
 import argparse
 
 
-def merge_lists(input_names, output_name, cutoff):
+def merge_lists(input_names, output_name, cutoff, lang):
     freq_dicts = []
+
+    # Don't use Chinese tokenization while building wordlists, as that would
+    # create a circular dependency.
+    if lang == 'zh':
+        lang = None
+
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang))
     merged = merge_freqs(freq_dicts)
     write_wordlist(merged, output_name)
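merge_freqs itself isn't touched by this diff; conceptually it collapses several word-to-frequency dicts into one. A rough sketch of that idea, assuming a simple average over the input lists (not necessarily the project's actual strategy):

    from collections import defaultdict

    def merge_freqs_sketch(freq_dicts):
        # Average each word's frequency across all input wordlists;
        # a word missing from a list contributes 0 for that list.
        merged = defaultdict(float)
        for freqs in freq_dicts:
            for word, freq in freqs.items():
                merged[word] += freq / len(freq_dicts)
        return dict(merged)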
@@ -14,7 +20,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
     parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output, args.cutoff)
+    merge_lists(args.inputs, args.output, args.cutoff, args.language)

View File

@@ -311,10 +311,11 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 2, 'lang': language})
 
         output_cBpack = wordlist_filename(
-            'combined-dist', language, 'msgpack.gz')
+            'combined-dist', language, 'msgpack.gz'
+        )
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                 extra='wordfreq_builder/word_counts.py',
                 params={'lang': language})
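With 'lang' added to params, the build statements that add_dep writes out presumably carry a per-build lang variable for the merge rule to expand, along these lines (the file names here are hypothetical):

    build data/combined/en.csv: merge data/freqs/en_part1.csv data/freqs/en_part2.csv | wordfreq_builder/word_counts.py
      cutoff = 2
      lang = en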

View File: wordfreq_builder/word_counts.py

@@ -42,9 +42,6 @@ def read_values(filename, cutoff=0, lang=None):
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
-
-    If lang is given, it will apply language specific preprocessing
-    operations.
     """
     values = defaultdict(float)
     total = 0.
@@ -80,7 +77,8 @@ def read_freqs(filename, cutoff=0, lang=None):
     for word in values:
         values[word] /= total
 
-    return values
+    if lang == 'en':
+        return correct_apostrophe_trimming(values)
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
@@ -205,11 +203,17 @@ APOSTROPHE_TRIMMED_PROB = {
     'needn': 1.,
 }
 
 
 def correct_apostrophe_trimming(freqs):
     """
     If what we got was an English wordlist that has been tokenized with
-    apostrophes as token boundaries, correct the spurious tokens we get by
-    adding 't in about the proportion we expect to see in the wordlist.
+    apostrophes as token boundaries, as indicated by the frequencies of the
+    words "wouldn" and "couldn", then correct the spurious tokens we get by
+    adding "'t" in about the proportion we expect to see in the wordlist.
+
+    We could also adjust the frequency of "t", but then we would be favoring
+    the token "s" over it, as "'s" leaves behind no indication when it's been
+    removed.
     """
     if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6):
         print("Applying apostrophe trimming")
@@ -219,4 +223,3 @@ def correct_apostrophe_trimming(freqs):
                 freqs[trim_word] = freq * (1 - trim_prob)
                 freqs[trim_word + "'t"] = freq * trim_prob
     return freqs
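As a worked example of the loop above (with made-up numbers): suppose the wordlist assigns "needn" a frequency of 2e-6, and the table gives APOSTROPHE_TRIMMED_PROB['needn'] = 1., meaning essentially every occurrence of "needn" came from a trimmed "needn't":

    freqs = {'needn': 2e-6}          # made-up frequency
    trim_prob = 1.0                  # APOSTROPHE_TRIMMED_PROB['needn']

    freq = freqs['needn']
    freqs['needn'] = freq * (1 - trim_prob)    # -> 0.0
    freqs["needn't"] = freq * trim_prob        # -> 2e-6, mass restored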