add twitter-stems-2014 wordlist data

This commit is contained in:
Rob Speer 2015-02-11 13:29:32 -05:00
parent bf0071fd8b
commit 6ab72201cd
3 changed files with 2202970 additions and 1 deletions

View File

@@ -189,6 +189,14 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
)
save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
logger.info("Loading stemmed Twitter corpus.")
twitter_stems_wordlist = read_multilingual_csv(
os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv')
)
for lang in twitter_stems_wordlist:
logger.info("\tLanguage: %s" % lang)
save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
logger.info("Done loading.")

View File

@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
or os.path.expanduser('~/.cache/wordfreq'))
# When the minor version number increments, the data may change.
VERSION = '0.4.1'
VERSION = '0.5.0'
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
# Put these options together to make a database filename.

File diff suppressed because it is too large Load Diff