From 8277b34571e75983c54f04a65d7c22aaec50d03f Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Fri, 4 Sep 2015 15:52:21 -0400 Subject: [PATCH] Use SUBTLEX for German, but OpenSubtitles for Greek In German and Greek, SUBTLEX and Hermit Dave turn out to have been working from the same source data. I looked at the quality of how they processed the data, and chose SUBTLEX for German, and Dave's wordlist for Greek. Former-commit-id: 77c60c29b09b8d712d935093ada8138761472c9e --- README.md | 6 +++++- wordfreq_builder/wordfreq_builder/config.py | 9 +++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e56c6ea..35670b9 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ at least 3 different sources of word frequencies: ──────────────────┼────────────────────────────────────────────────── Arabic ar │ - - Yes Yes Yes Yes German de │ - Yes Yes Yes Yes[1] Yes - Greek el │ - Yes Yes Yes Yes Yes + Greek el │ - - Yes Yes Yes Yes English en │ Yes Yes Yes Yes Yes Yes Spanish es │ - - Yes Yes Yes Yes French fr │ - - Yes Yes Yes Yes @@ -252,6 +252,10 @@ Twitter; it does not display or republish any Twitter content. Methods, 41 (4), 977-990. http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf +- Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A. M., Bölte, J., & Böhl, A. + (2015). The word frequency effect. Experimental Psychology. + http://econtent.hogrefe.com/doi/abs/10.1027/1618-3169/a000123?journalCode=zea + - Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character frequencies based on film subtitles. PLoS One, 5(6), e10729. http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729 diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 87c575b..dc61bc6 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -23,9 +23,10 @@ CONFIG = { 'pt', 'ru', 'tr' ], 'opensubtitles': [ - # All languages where the most common word in OpenSubtitles - # appears at least 5000 times - 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', + # This list includes languages where the most common word in + # OpenSubtitles appears at least 5000 times. However, we exclude + # German, where SUBTLEX has done better processing of the same data. + 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'el', 'en', 'es', 'et', 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv', 'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'tr', 'uk', 'zh' @@ -39,7 +40,7 @@ CONFIG = { # Russian, Spanish, and (Simplified) Chinese. ], 'subtlex-en': ['en'], - 'subtlex-other': ['de', 'el', 'nl', 'zh'], + 'subtlex-other': ['de', 'nl', 'zh'], }, # Subtlex languages that need to be pre-processed 'wordlist_paths': {