limit Reddit data to just English

Former-commit-id: 2276d97368
This commit is contained in:
Robyn Speer 2016-04-15 17:01:21 -04:00
parent 5a37cc22c7
commit a0d93e0ce8
12 changed files with 4 additions and 4 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -42,10 +42,10 @@ CONFIG = {
'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh'],
-# About 99.2% of Reddit is in English, but there are pockets of
-# conversation in other languages. These are the languages that seem
-# to have enough non-spam comments to actually learn from.
-'reddit': ['de', 'en', 'es', 'sv']
+# About 99.2% of Reddit is in English. There are pockets of
+# conversation in other languages, but we're concerned that they're not
+# representative enough for learning general word frequencies.
+'reddit': ['en']
},
# Subtlex languages that need to be pre-processed
'wordlist_paths': {