diff --git a/wordfreq/build.py b/wordfreq/build.py index 778b2dc..d570117 100644 --- a/wordfreq/build.py +++ b/wordfreq/build.py @@ -189,6 +189,14 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False): ) save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist) + logger.info("Loading stemmed Twitter corpus.") + twitter_stems_wordlist = read_multilingual_csv( + os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv') + ) + for lang in twitter_stems_wordlist: + logger.info("\tLanguage: %s" % lang) + save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang]) + logger.info("Done loading.") diff --git a/wordfreq/config.py b/wordfreq/config.py index 3c780aa..558acf3 100644 --- a/wordfreq/config.py +++ b/wordfreq/config.py @@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA') or os.path.expanduser('~/.cache/wordfreq')) # When the minor version number increments, the data may change. -VERSION = '0.4.1' +VERSION = '0.5.0' MINOR_VERSION = '.'.join(VERSION.split('.')[:2]) # Put these options together to make a database filename. diff --git a/wordfreq_data/luminoso/twitter-stems-2014.csv.REMOVED.git-id b/wordfreq_data/luminoso/twitter-stems-2014.csv.REMOVED.git-id new file mode 100644 index 0000000..24306da --- /dev/null +++ b/wordfreq_data/luminoso/twitter-stems-2014.csv.REMOVED.git-id @@ -0,0 +1 @@ +3710e65f27753facc699fe56269c9631d5ba6aba \ No newline at end of file