Merge pull request #2 from LuminosoInsight/new-twitter-lists

New Twitter lists

Former-commit-id: 5a4d3a87d5
This commit is contained in:
Andrew Lin 2015-02-17 15:36:13 -05:00
commit 39d914f8e1
5 changed files with 20 additions and 3 deletions

View File

@ -189,6 +189,22 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
) )
save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist) save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
logger.info("Loading stemmed Twitter corpus.")
twitter_stems_wordlist = read_multilingual_csv(
os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv')
)
for lang in twitter_stems_wordlist:
logger.info("\tLanguage: %s" % lang)
save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
logger.info("Loading unstemmed Twitter corpus.")
twitter_stems_wordlist = read_multilingual_csv(
os.path.join(source_dir, 'luminoso', 'twitter-surfaces-2014.csv')
)
for lang in twitter_stems_wordlist:
logger.info("\tLanguage: %s" % lang)
save_wordlist_to_db(conn, 'twitter-surfaces', lang, twitter_stems_wordlist[lang])
logger.info("Done loading.") logger.info("Done loading.")

View File

@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
or os.path.expanduser('~/.cache/wordfreq')) or os.path.expanduser('~/.cache/wordfreq'))
# When the minor version number increments, the data may change. # When the minor version number increments, the data may change.
VERSION = '0.4.1' VERSION = '0.5.0'
MINOR_VERSION = '.'.join(VERSION.split('.')[:2]) MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
# Put these options together to make a database filename. # Put these options together to make a database filename.

View File

@ -1,6 +1,5 @@
# coding: utf-8 # coding: utf-8
from unicodedata import normalize from unicodedata import normalize
from ftfy.fixes import remove_unsafe_private_use
def standardize_word(word): def standardize_word(word):
@ -21,4 +20,4 @@ def standardize_word(word):
that the capitalized versions will not share a word count with the that the capitalized versions will not share a word count with the
lowercase versions. lowercase versions.
""" """
return normalize('NFKC', remove_unsafe_private_use(word)).lower() return normalize('NFKC', word).lower()

View File

@ -0,0 +1 @@
3710e65f27753facc699fe56269c9631d5ba6aba

View File

@ -0,0 +1 @@
8ba8230ca42d8e9e622afee772b3a96c34126e23