add surface forms from Twitter 2014 data

This commit is contained in:
Rob Speer 2015-02-17 15:06:11 -05:00
parent b6f246ecbb
commit ffdaa82b11
2 changed files with 2370462 additions and 0 deletions

View File

@@ -197,6 +197,14 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
logger.info("\tLanguage: %s" % lang)
save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
logger.info("Loading unstemmed Twitter corpus.")
twitter_stems_wordlist = read_multilingual_csv(
os.path.join(source_dir, 'luminoso', 'twitter-surfaces-2014.csv')
)
for lang in twitter_stems_wordlist:
logger.info("\tLanguage: %s" % lang)
save_wordlist_to_db(conn, 'twitter-surfaces', lang, twitter_stems_wordlist[lang])
logger.info("Done loading.")

File diff suppressed because it is too large Load Diff