Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
parent 8035df998a · commit d0ada70355
@@ -174,14 +174,23 @@ following reasonable conditions:

 `data/source-lists/subtlex` contains the following files:

-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted

+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
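Note: the "converted from ISO-8859-1 to UTF-8" step above was done by hand, outside the build. A minimal sketch of that re-encoding, assuming a plain text file (the helper name is made up, not part of wordfreq_builder):

```python
# Hypothetical re-encoding helper, mirroring the manual step described above.
def reencode(in_path, out_path, source_encoding='iso-8859-1'):
    with open(in_path, encoding=source_encoding) as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            outfile.write(line)
```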
@@ -56,10 +56,11 @@ rule convert_leeds

 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out

-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-  command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
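For readers who don't parse shell pipelines on sight, here is an approximate Python rendering of the new `convert_subtlex` rule. It is a sketch, not code from the repository, and it assumes the `ftfy` library is installed (the rule itself pipes through ftfy's command-line tool):

```python
import ftfy

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    # Roughly: cut -f $textcol,$freqcol | tail -n +$startrow | ftfy
    #          | tr '<tab>",' ', ' | grep -v 'â,'
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:
                continue                        # strip the header row(s)
            fields = line.rstrip('\n').split('\t')
            word = fields[textcol - 1]          # cut's columns are 1-based
            freq = fields[freqcol - 1]
            # ftfy repairs mojibake; quotes and stray commas become spaces
            word = ftfy.fix_text(word).replace('"', ' ').replace(',', ' ')
            row = word + ',' + freq
            if 'â,' in row:
                continue                        # unfixable half-mojibake
            outfile.write(row + '\n')
```

The `grep -v 'â,'` heuristic drops lines where the word still ends in the telltale 'â' of half-decoded UTF-8 even after ftfy has had its chance.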
@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse


 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
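With this change the merge script operates on raw values rather than normalized frequencies; the `total` returned by `read_values` is simply discarded here. For context, `merge_counts` (from wordfreq_builder.word_counts) presumably reduces to a per-word summation like the following; a guess consistent with its call site, not the actual implementation:

```python
from collections import defaultdict

def merge_counts_sketch(count_dicts):
    # Sum each word's value across all the input dictionaries.
    merged = defaultdict(float)
    for counts in count_dicts:
        for word, value in counts.items():
            merged[word] += value
    return merged
```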
@@ -13,6 +13,7 @@ CONFIG = {
         # 'th' when we get tokenization for it
         # 'hi' when we stop messing up its tokenization
         # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
             # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
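Note that 'subtlex-en' and 'subtlex-other' deliberately share one path template, so every SUBTLEX wordlist lands in the same generated/subtlex directory. A hypothetical illustration of how these templates presumably get filled in by `wordlist_filename`:

```python
template = 'generated/subtlex/subtlex_{lang}.{ext}'
print(template.format(lang='nl', ext='counts.txt'))
# generated/subtlex/subtlex_nl.counts.txt
```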
|
@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
|
||||
)
|
||||
)
|
||||
lines.extend(
|
||||
subtlex_zh_deps(
|
||||
subtlex_other_deps(
|
||||
data_filename('source-lists/subtlex'),
|
||||
CONFIG['sources']['subtlex-zh']
|
||||
CONFIG['sources']['subtlex-other']
|
||||
)
|
||||
)
|
||||
lines.extend(combine_lists(all_languages()))
|
||||
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines


+# Which columns of the SUBTLEX data files do the word and its frequency appear
+# in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
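The pairs in SUBTLEX_COLUMN_MAP are 1-based (word column, frequency column) indexes: they are substituted straight into the `cut -f` invocation of the convert_subtlex rule, not used as Python list indexes. For example:

```python
SUBTLEX_COLUMN_MAP = {'de': (1, 3), 'el': (2, 3), 'en': (1, 2),
                      'nl': (1, 2), 'zh': (1, 5)}
textcol, freqcol = SUBTLEX_COLUMN_MAP['zh']
# textcol == 1, freqcol == 5: subtlex.zh.txt keeps the word in column 1
# and its frequency count in column 5, so the rule runs `cut -f 1,5`.
```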
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )

     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
     return lines


-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
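The `startrow` parameter feeds `tail -n +$startrow`, which prints from line `startrow` onward: startrow=2 drops the single header line, while startrow=5 drops the Greek file's four (the usual header plus its three extras). The same selection in Python, with made-up data:

```python
lines = ['header', 'extra 1', 'extra 2', 'extra 3', 'first real row']
startrow = 5
print(lines[startrow - 1:])   # equivalent to `tail -n +5`
# ['first real row']
```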
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})

         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts


-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.

-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.

-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.

-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
         for token in tokens:
             # Use += so that, if we give the reader concatenated files with
             # duplicates, it does the right thing
-            raw_counts[token] += val
+            values[token] += val
             total += val
+    return values, total

-    for word in raw_counts:
-        raw_counts[word] /= total
-
-    return raw_counts
+
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+
+    return values


 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
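This refactoring separates reading from normalization: `read_values` returns the raw per-word values plus their total, and `read_freqs` divides through by that total. A quick usage sketch with a made-up two-line CSV:

```python
# counts.csv (made-up contents):
#   the,120
#   cat,80
values, total = read_values('counts.csv')
print(total)           # 200.0
freqs = read_freqs('counts.csv')
print(freqs['the'])    # 0.6 -- normalized so all frequencies sum to 1
```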