diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz index 5e5f433..6722a04 100644 Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz index 0cf0cbf..ae7c76d 100644 Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz index 0f69a11..23e029b 100644 Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ diff --git a/wordfreq/data/combined_sv.msgpack.gz b/wordfreq/data/combined_sv.msgpack.gz index 4aa7520..7de7fac 100644 Binary files a/wordfreq/data/combined_sv.msgpack.gz and b/wordfreq/data/combined_sv.msgpack.gz differ diff --git a/wordfreq/data/large_de.msgpack.gz b/wordfreq/data/large_de.msgpack.gz index fb717c2..233cdcd 100644 Binary files a/wordfreq/data/large_de.msgpack.gz and b/wordfreq/data/large_de.msgpack.gz differ diff --git a/wordfreq/data/large_en.msgpack.gz b/wordfreq/data/large_en.msgpack.gz index 8a93bb9..b4eceb4 100644 Binary files a/wordfreq/data/large_en.msgpack.gz and b/wordfreq/data/large_en.msgpack.gz differ diff --git a/wordfreq/data/large_es.msgpack.gz b/wordfreq/data/large_es.msgpack.gz index 0fb0a11..a3d816a 100644 Binary files a/wordfreq/data/large_es.msgpack.gz and b/wordfreq/data/large_es.msgpack.gz differ diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz index 169ea53..725e96e 100644 Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz index 331d651..737eda2 100644 Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz index ecb90e6..dd75ab7 100644 Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ diff --git a/wordfreq/data/twitter_sv.msgpack.gz b/wordfreq/data/twitter_sv.msgpack.gz index cee9c39..0aaf80b 100644 Binary files a/wordfreq/data/twitter_sv.msgpack.gz and b/wordfreq/data/twitter_sv.msgpack.gz differ diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 4eac43b..757b3f4 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -42,10 +42,10 @@ CONFIG = { 'subtlex-other': ['de', 'nl', 'zh'], 'jieba': ['zh'], - # About 99.2% of Reddit is in English, but there are pockets of - # conversation in other languages. These are the languages that seem - # to have enough non-spam comments to actually learn from. - 'reddit': ['de', 'en', 'es', 'sv'] + # About 99.2% of Reddit is in English. There are pockets of + # conversation in other languages, but we're concerned that they're not + # representative enough for learning general word frequencies. + 'reddit': ['en'] }, # Subtlex languages that need to be pre-processed 'wordlist_paths': {