From 34474939f2bad73348737af419f362355cdabf19 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Fri, 4 Sep 2015 12:37:35 -0400
Subject: [PATCH] add more SUBTLEX and fix its build rules

---
 wordfreq_builder/README.md                        | 19 +++++++---
 wordfreq_builder/rules.ninja                      |  7 ++--
 .../wordfreq_builder/cli/merge_counts.py          |  5 ++-
 wordfreq_builder/wordfreq_builder/config.py       |  5 ++-
 wordfreq_builder/wordfreq_builder/ninja.py        | 36 ++++++++++++++----
 .../wordfreq_builder/word_counts.py               | 37 ++++++++++++++-----
 6 files changed, 79 insertions(+), 30 deletions(-)

diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 021bc0f..f2fdfb9 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -174,14 +174,23 @@ following reasonable conditions:
 
 `data/source-lists/subtlex` contains the following files:
 
-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted
 
+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index e4d95e0..986678c 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,10 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out
 
-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the word and frequency columns, strip the
+# header, run it through ftfy, convert tabs to commas and spurious CSV
+# formatting to nothing, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-  command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
 
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
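For reference, the new convert_subtlex rule corresponds roughly to the Python
sketch below. The function name and file paths are hypothetical; the column
and row parameters, the ftfy pass, and the mojibake filter come from the rule
above.

    import ftfy

    def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
        # Rough equivalent of:
        #   cut -f $textcol,$freqcol | tail -n +$startrow | ftfy
        #     | tr '<tab>",' ',  ' | grep -v 'â,'
        with open(in_path, encoding='utf-8') as infile, \
             open(out_path, 'w', encoding='utf-8') as outfile:
            for lineno, line in enumerate(infile, 1):
                if lineno < startrow:        # tail -n +N skips header rows
                    continue
                fields = line.rstrip('\n').split('\t')
                word = ftfy.fix_text(fields[textcol - 1])  # cut is 1-based
                freq = fields[freqcol - 1]
                # tr: stray quotes and commas become spaces, the tab that
                # separates the two fields becomes a comma
                word = word.replace('"', ' ').replace(',', ' ')
                freq = freq.replace('"', ' ').replace(',', ' ')
                row = word + ',' + freq
                if 'â,' in row:              # drop unfixable half-mojibake
                    continue
                outfile.write(row + '\n')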
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 772b951..5e3de69 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
 
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 044f987..87c575b 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -13,6 +13,7 @@ CONFIG = {
         # 'th' when we get tokenization for it
         # 'hi' when we stop messing up its tokenization
         # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms',
             'nl', 'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
         # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
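Both SUBTLEX sources map to the same path template, so the generated
filenames depend only on the language, not on which source group produced
them. A sketch of the lookup, assuming a helper along the lines of
wordlist_filename in wordfreq_builder.config (the real one may also prepend
the data directory):

    from wordfreq_builder.config import CONFIG

    def wordlist_filename(source, language, extension='txt'):
        # Fill in the {lang} and {ext} slots of the source's path template
        template = CONFIG['wordlist_paths'][source]
        return template.format(lang=language, ext=extension)

    wordlist_filename('subtlex-other', 'el', 'processed.txt')
    # -> 'generated/subtlex/subtlex_el.processed.txt'
    wordlist_filename('subtlex-en', 'en', 'counts.txt')
    # -> 'generated/subtlex/subtlex_en.counts.txt'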
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 65773d6..2ae66c4 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         )
     )
     lines.extend(
-        subtlex_zh_deps(
+        subtlex_other_deps(
             data_filename('source-lists/subtlex'),
-            CONFIG['sources']['subtlex-zh']
+            CONFIG['sources']['subtlex-other']
         )
     )
     lines.extend(combine_lists(all_languages()))
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines
 
 
+# Which columns of the SUBTLEX data files do the word and its frequency
+# appear in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
     return lines
 
 
-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
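The column map and the Greek special case together determine the variables
passed to each convert_subtlex build step. A quick way to preview them (the
loop itself is illustrative; the values come from SUBTLEX_COLUMN_MAP and
subtlex_other_deps above):

    from wordfreq_builder.ninja import SUBTLEX_COLUMN_MAP

    for language in ['de', 'el', 'nl', 'zh']:
        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
        startrow = 5 if language == 'el' else 2  # Greek's extra header lines
        print(language, {'textcol': textcol, 'freqcol': freqcol,
                         'startrow': startrow})
    # 'el', for instance, gives {'textcol': 2, 'freqcol': 3, 'startrow': 5}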
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 63d1980..1933295 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts
 
 
-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.
 
-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.
 
-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
 
-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language-specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                values[token] += val
                 total += val
+    return values, total
 
-    for word in raw_counts:
-        raw_counts[word] /= total
-    return raw_counts
+
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language-specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
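The split into read_values and read_freqs is what lets merge_counts.py work
on raw counts instead of frequencies that were already normalized to sum
to 1. A usage sketch of the two readers (the filename is illustrative):

    from wordfreq_builder.word_counts import read_values, read_freqs

    # Raw values plus their total, suitable for merging counts across files
    values, total = read_values('subtlex_el.counts.txt', cutoff=0)

    # The same values, normalized so that the frequencies sum to 1
    freqs = read_freqs('subtlex_el.counts.txt', cutoff=0)
    assert abs(sum(freqs.values()) - 1.0) < 1e-6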