mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Use SUBTLEX for German, but OpenSubtitles for Greek
In German and Greek, SUBTLEX and Hermit Dave's OpenSubtitles wordlists turn out to have been built from the same source data. I compared how well each of them processed that data, and chose SUBTLEX for German and Hermit Dave's wordlist for Greek.
This commit is contained in:
parent
a47497c908
commit
77c60c29b0
@@ -129,7 +129,7 @@ at least 3 different sources of word frequencies:
|
||||
──────────────────┼──────────────────────────────────────────────────
|
||||
Arabic ar │ - - Yes Yes Yes Yes
|
||||
German de │ - Yes Yes Yes Yes[1] Yes
|
||||
Greek el │ - Yes Yes Yes Yes Yes
|
||||
Greek el │ - - Yes Yes Yes Yes
|
||||
English en │ Yes Yes Yes Yes Yes Yes
|
||||
Spanish es │ - - Yes Yes Yes Yes
|
||||
French fr │ - - Yes Yes Yes Yes
|
||||
@@ -252,6 +252,10 @@ Twitter; it does not display or republish any Twitter content.
|
||||
Methods, 41 (4), 977-990.
|
||||
http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf
|
||||
|
||||
- Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A. M., Bölte, J., & Böhl, A.
|
||||
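(2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. Experimental Psychology, 58(5), 412-424.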
|
||||
http://econtent.hogrefe.com/doi/abs/10.1027/1618-3169/a000123?journalCode=zea
|
||||
|
||||
- Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character
|
||||
frequencies based on film subtitles. PLoS One, 5(6), e10729.
|
||||
http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729
|
||||
|
@@ -23,9 +23,10 @@ CONFIG = {
|
||||
'pt', 'ru', 'tr'
|
||||
],
|
||||
'opensubtitles': [
|
||||
# All languages where the most common word in OpenSubtitles
|
||||
# appears at least 5000 times
|
||||
'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
|
||||
# This list includes languages where the most common word in
|
||||
# OpenSubtitles appears at least 5000 times. However, we exclude
|
||||
# German, where SUBTLEX has done better processing of the same data.
|
||||
'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'el', 'en', 'es', 'et',
|
||||
'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
|
||||
'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq',
|
||||
'sr', 'sv', 'tr', 'uk', 'zh'
|
||||
@@ -39,7 +40,7 @@ CONFIG = {
|
||||
# Russian, Spanish, and (Simplified) Chinese.
|
||||
],
|
||||
'subtlex-en': ['en'],
|
||||
'subtlex-other': ['de', 'el', 'nl', 'zh'],
|
||||
'subtlex-other': ['de', 'nl', 'zh'],
|
||||
},
|
||||
# Subtlex languages that need to be pre-processed
|
||||
'wordlist_paths': {
|
||||
|
Loading…
Reference in New Issue
Block a user