From 9c08442dc5f57236105fdfa63147b038057ae29a Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 9 Sep 2015 13:10:18 -0400 Subject: [PATCH] fixes based on code review notes Former-commit-id: 354555514f967edb726c60b814f7df1b669fbfb5 --- README.md | 10 ++++------ wordfreq_builder/wordfreq_builder/ninja.py | 8 +++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4502d39..0bba163 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ install them on Ubuntu: ## Usage wordfreq provides access to estimates of the frequency with which a word is -used, in 15 languages (see *Supported languages* below). It loads +used, in 16 languages (see *Supported languages* below). It loads efficiently-packed data structures that contain all words that appear at least once per million words. @@ -122,13 +122,13 @@ of word usage on different topics at different levels of formality. The sources - **Twitter**: Messages sampled from Twitter's public stream - **Wikipedia**: The full text of Wikipedia in 2015 -The following 12 languages are well-supported, with reasonable tokenization and +The following 14 languages are well-supported, with reasonable tokenization and at least 3 different sources of word frequencies: Language Code GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia ──────────────────┼────────────────────────────────────────────────── Arabic ar │ - - Yes Yes Yes Yes - German de │ - Yes Yes Yes Yes[1] Yes + German de │ - Yes Yes - Yes[1] Yes Greek el │ - - Yes Yes Yes Yes English en │ Yes Yes Yes Yes Yes Yes Spanish es │ - - Yes Yes Yes Yes @@ -225,9 +225,7 @@ sources: It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and SUBTLEX-CH, created by Marc Brysbaert et al. and available at -http://crr.ugent.be/programs-data/subtitle-frequencies. SUBTLEX was first -published in this paper: - +http://crr.ugent.be/programs-data/subtitle-frequencies. I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to distribute these wordlists diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index db0db9f..5b177d4 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -253,11 +253,9 @@ def subtlex_other_deps(dirname_in, languages): output_file = wordlist_filename('subtlex-other', language, 'counts.txt') textcol, freqcol = SUBTLEX_COLUMN_MAP[language] - # Greek has three extra header lines for no reason - if language == 'el': - startrow = 5 - else: - startrow = 2 + # Skip one header line by setting 'startrow' to 2 (because tail is 1-based). + # I hope we don't need to configure this by language anymore. + startrow = 2 add_dep( lines, 'convert_subtlex', input_file, processed_file,