From 872556f7bb9ad9c017da65ffc549ba39a8661371 Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Wed, 9 Sep 2015 13:10:18 -0400
Subject: [PATCH] fixes based on code review notes

Former-commit-id: 354555514f967edb726c60b814f7df1b669fbfb5
---
 README.md                                  | 10 ++++------
 wordfreq_builder/wordfreq_builder/ninja.py |  8 +++-----
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 90d0fbf..6cb3652 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ install them on Ubuntu:
 ## Usage
 
 wordfreq provides access to estimates of the frequency with which a word is
-used, in 15 languages (see *Supported languages* below). It loads
+used, in 16 languages (see *Supported languages* below). It loads
 efficiently-packed data structures that contain all words that appear at least
 once per million words.
 
@@ -122,13 +122,13 @@ of word usage on different topics at different levels of formality. The sources
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015
 
-The following 12 languages are well-supported, with reasonable tokenization and
+The following 14 languages are well-supported, with reasonable tokenization and
 at least 3 different sources of word frequencies:
 
     Language    Code    GBooks  SUBTLEX LeedsIC OpenSub Twitter Wikipedia
     ──────────────────┼──────────────────────────────────────────────────
     Arabic      ar    │ -       -       Yes     Yes     Yes     Yes
-    German      de    │ -       Yes     Yes     Yes     Yes[1]  Yes
+    German      de    │ -       Yes     Yes     -       Yes[1]  Yes
     Greek       el    │ -       -       Yes     Yes     Yes     Yes
     English     en    │ Yes     Yes     Yes     Yes     Yes     Yes
     Spanish     es    │ -       -       Yes     Yes     Yes     Yes
@@ -225,9 +225,7 @@ sources:
 
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
 SUBTLEX-CH, created by Marc Brysbaert et al. and available at
-http://crr.ugent.be/programs-data/subtitle-frequencies. SUBTLEX was first
-published in this paper:
-
+http://crr.ugent.be/programs-data/subtitle-frequencies.
 
 I (Robyn Speer) have
 obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index db0db9f..5b177d4 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -253,11 +253,9 @@ def subtlex_other_deps(dirname_in, languages):
         output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
         textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
 
-        # Greek has three extra header lines for no reason
-        if language == 'el':
-            startrow = 5
-        else:
-            startrow = 2
+        # Skip the single header line. 'startrow' is a 1-based line number (as tail
+        # counts lines), so 2 starts at the first data row. Hopefully this no longer
+        startrow = 2
 
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,