mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
limit Reddit data to just English
This commit is contained in:
parent
ced15d6eff
commit
2276d97368
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -42,10 +42,10 @@ CONFIG = {
|
||||
'subtlex-other': ['de', 'nl', 'zh'],
|
||||
'jieba': ['zh'],
|
||||
|
||||
# About 99.2% of Reddit is in English, but there are pockets of
|
||||
# conversation in other languages. These are the languages that seem
|
||||
# to have enough non-spam comments to actually learn from.
|
||||
'reddit': ['de', 'en', 'es', 'sv']
|
||||
# About 99.2% of Reddit is in English. There are pockets of
|
||||
# conversation in other languages, but we're concerned that they're not
|
||||
# representative enough for learning general word frequencies.
|
||||
'reddit': ['en']
|
||||
},
|
||||
# Subtlex languages that need to be pre-processed
|
||||
'wordlist_paths': {
|
||||
|
Loading…
Reference in New Issue
Block a user