diff --git a/README.md b/README.md index 5ab4df2..6e894ed 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ install them on Ubuntu: ## Usage wordfreq provides access to estimates of the frequency with which a word is -used, in 15 languages (see *Supported languages* below). It loads +used, in 18 languages (see *Supported languages* below). It loads efficiently-packed data structures that contain all words that appear at least once per million words. @@ -124,7 +124,6 @@ The sources (and the abbreviations we'll use for them) are: Ngrams 2013, and Chinese frequencies from the frequency dictionary that comes with the Jieba tokenizer. - The following 17 languages are well-supported, with reasonable tokenization and at least 3 different sources of word frequencies: @@ -152,7 +151,7 @@ at least 3 different sources of word frequencies: Additionally, Korean is marginally supported. You can look up frequencies in it, but we have too few data sources for it so far: - Language Code SUBTLEX LeedsIC OpenSub Twitter Wpedia + Language Code SUBTLEX OpenSub LeedsIC Twitter Wpedia ──────────────────┼─────────────────────────────────────── Korean ko │ - - - Yes Yes @@ -233,9 +232,15 @@ sources: - Wikipedia, the free encyclopedia (http://www.wikipedia.org) +<<<<<<< HEAD It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and available at http://crr.ugent.be/programs-data/subtitle-frequencies. +======= +It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and +SUBTLEX-CH, created by Marc Brysbaert et al. and available at +http://crr.ugent.be/programs-data/subtitle-frequencies. +>>>>>>> greek-and-turkish I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to distribute these wordlists diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md index b936e39..af47613 100644 --- a/wordfreq_builder/README.md +++ b/wordfreq_builder/README.md @@ -164,7 +164,7 @@ because the word was left as "can". ### SUBTLEX -Mark Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in +Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in wordfreq and derived works without the "academic use" restriction, under the following reasonable conditions: diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 5f9e59b..2c94f58 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -278,21 +278,17 @@ def subtlex_other_deps(dirname_in, languages): output_file = wordlist_filename('subtlex-other', language, 'counts.txt') textcol, freqcol = SUBTLEX_COLUMN_MAP[language] - # Greek has three extra header lines for no reason - if language == 'el': - startrow = 5 - else: - startrow = 2 - if language == 'zh': step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt') add_dep(lines, 'simplify_chinese', input_file, step2_file) else: step2_file = input_file + # Skip one header line by setting 'startrow' to 2 (because tail is 1-based). + # I hope we don't need to configure this by language anymore. add_dep( lines, 'convert_subtlex', step2_file, processed_file, - params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow} + params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} ) add_dep( lines, 'merge_counts', processed_file, output_file