From 25e24f9c328904f6516de833ada6193fa3c34324 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Fri, 4 Sep 2015 01:50:15 -0400 Subject: [PATCH] Note on next languages to support Former-commit-id: 531db642880cbf646edb1a63026b85446c04e78f --- wordfreq_builder/wordfreq_builder/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 142c7ab..044f987 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -8,6 +8,11 @@ CONFIG = { 'sources': { # A list of language codes (possibly un-standardized) that we'll # look up in filenames for these various data sources. + # + # Consider adding: + # 'th' when we get tokenization for it + # 'hi' when we stop messing up its tokenization + # 'tl' because it's probably ready right now 'twitter': [ 'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', 'pt', 'ru', 'tr'