mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Merge pull request #2 from LuminosoInsight/new-twitter-lists
New Twitter lists
This commit is contained in:
commit
5a4d3a87d5
@ -189,6 +189,22 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
||||
)
|
||||
save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
|
||||
|
||||
logger.info("Loading stemmed Twitter corpus.")
|
||||
twitter_stems_wordlist = read_multilingual_csv(
|
||||
os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv')
|
||||
)
|
||||
for lang in twitter_stems_wordlist:
|
||||
logger.info("\tLanguage: %s" % lang)
|
||||
save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
|
||||
|
||||
logger.info("Loading unstemmed Twitter corpus.")
|
||||
twitter_stems_wordlist = read_multilingual_csv(
|
||||
os.path.join(source_dir, 'luminoso', 'twitter-surfaces-2014.csv')
|
||||
)
|
||||
for lang in twitter_stems_wordlist:
|
||||
logger.info("\tLanguage: %s" % lang)
|
||||
save_wordlist_to_db(conn, 'twitter-surfaces', lang, twitter_stems_wordlist[lang])
|
||||
|
||||
logger.info("Done loading.")
|
||||
|
||||
|
||||
|
@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
||||
or os.path.expanduser('~/.cache/wordfreq'))
|
||||
|
||||
# When the minor version number increments, the data may change.
|
||||
VERSION = '0.4.1'
|
||||
VERSION = '0.5.0'
|
||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||
|
||||
# Put these options together to make a database filename.
|
||||
|
@ -1,6 +1,5 @@
|
||||
# coding: utf-8
|
||||
from unicodedata import normalize
|
||||
from ftfy.fixes import remove_unsafe_private_use
|
||||
|
||||
|
||||
def standardize_word(word):
|
||||
@ -21,4 +20,4 @@ def standardize_word(word):
|
||||
that the capitalized versions will not share a word count with the
|
||||
lowercase versions.
|
||||
"""
|
||||
return normalize('NFKC', remove_unsafe_private_use(word)).lower()
|
||||
return normalize('NFKC', word).lower()
|
||||
|
2202961
wordfreq_data/luminoso/twitter-stems-2014.csv
Normal file
2202961
wordfreq_data/luminoso/twitter-stems-2014.csv
Normal file
File diff suppressed because it is too large
Load Diff
2370454
wordfreq_data/luminoso/twitter-surfaces-2014.csv
Normal file
2370454
wordfreq_data/luminoso/twitter-surfaces-2014.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user