mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-25 18:18:53 +00:00
Merge branch 'dutch-201504' into ftfy4
Conflicts:
setup.py
Former-commit-id: 24a7c73e6d
This commit is contained in:
commit
d7ea4c420c
@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import codecs
|
import codecs
|
||||||
@ -47,14 +48,15 @@ def _read_csv_basic(filename):
|
|||||||
|
|
||||||
counts = {}
|
counts = {}
|
||||||
for line in infile:
|
for line in infile:
|
||||||
line = line.rstrip(u'\n')
|
if ',' in line:
|
||||||
word, count = line.rsplit(u',', 1)
|
line = line.rstrip('\n')
|
||||||
|
word, count = line.rsplit(',', 1)
|
||||||
count = float(count)
|
count = float(count)
|
||||||
counts[standardize_word(word)] = count
|
counts[standardize_word(word)] = count
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
NUMBER_RE = re.compile(u'[0-9]+')
|
NUMBER_RE = re.compile('[0-9]+')
|
||||||
def read_leeds_corpus(filename):
|
def read_leeds_corpus(filename):
|
||||||
"""
|
"""
|
||||||
Load word frequencies from a "Web as Corpus" file, collected and
|
Load word frequencies from a "Web as Corpus" file, collected and
|
||||||
@ -68,9 +70,9 @@ def read_leeds_corpus(filename):
|
|||||||
for line in infile:
|
for line in infile:
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
if line:
|
if line:
|
||||||
rank = line.split(u' ')[0]
|
rank = line.split(' ')[0]
|
||||||
if NUMBER_RE.match(rank) and line.count(u' ') == 2:
|
if NUMBER_RE.match(rank) and line.count(' ') == 2:
|
||||||
_, freq, token = line.split(u' ')
|
_, freq, token = line.split(' ')
|
||||||
token = standardize_word(ftfy(token))
|
token = standardize_word(ftfy(token))
|
||||||
freq = float(freq)
|
freq = float(freq)
|
||||||
counts[token] += freq
|
counts[token] += freq
|
||||||
@ -131,6 +133,24 @@ def get_db_connection(filename):
|
|||||||
return sqlite3.connect(filename)
|
return sqlite3.connect(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def read_leeds_wordlist_into_db(conn, filename, dbname, lang):
|
||||||
|
logger.info("Loading %r" % filename)
|
||||||
|
wordlist = read_leeds_corpus(filename)
|
||||||
|
save_wordlist_to_db(conn, dbname, lang, wordlist)
|
||||||
|
|
||||||
|
|
||||||
|
def read_wordlist_into_db(conn, filename, dbname, lang='*'):
|
||||||
|
logger.info("Loading %r", filename)
|
||||||
|
if lang == '*':
|
||||||
|
multi_wordlist = read_multilingual_csv(filename)
|
||||||
|
for lang in multi_wordlist:
|
||||||
|
logger.info("\tLanguage: %s", lang)
|
||||||
|
save_wordlist_to_db(conn, dbname, lang, multi_wordlist[lang])
|
||||||
|
else:
|
||||||
|
wordlist = read_csv(filename)
|
||||||
|
save_wordlist_to_db(conn, dbname, lang, wordlist)
|
||||||
|
|
||||||
|
|
||||||
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
|
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
|
||||||
def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
||||||
"""
|
"""
|
||||||
@ -157,53 +177,29 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
|||||||
if filename is None:
|
if filename is None:
|
||||||
filename = config.DB_FILENAME
|
filename = config.DB_FILENAME
|
||||||
|
|
||||||
|
def wordlist_path(*pieces):
|
||||||
|
return os.path.join(source_dir, *pieces)
|
||||||
|
|
||||||
logger.info("Creating database")
|
logger.info("Creating database")
|
||||||
conn = create_db(filename)
|
conn = create_db(filename)
|
||||||
|
|
||||||
logger.info("Loading Leeds internet corpus:")
|
|
||||||
for lang in LEEDS_LANGUAGES:
|
for lang in LEEDS_LANGUAGES:
|
||||||
logger.info("\tLanguage: %s" % lang)
|
filename = wordlist_path('leeds', 'internet-%s-forms.num' % lang)
|
||||||
filename = os.path.join(
|
read_leeds_wordlist_into_db(conn, filename, 'leeds-internet', lang)
|
||||||
source_dir, 'leeds', 'internet-%s-forms.num' % lang
|
|
||||||
)
|
|
||||||
wordlist = read_leeds_corpus(filename)
|
|
||||||
save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)
|
|
||||||
|
|
||||||
logger.info("Loading Google Books (English).")
|
read_wordlist_into_db(conn, wordlist_path('google', 'google-books-english.csv'), 'google-books', 'en')
|
||||||
google_wordlist = read_csv(
|
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-52M.csv'), 'twitter', 'xx')
|
||||||
os.path.join(source_dir, 'google', 'google-books-english.csv')
|
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-stems-2014.csv'), 'twitter-stems', '*')
|
||||||
)
|
read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-surfaces-2014.csv'), 'twitter-surfaces', '*')
|
||||||
save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)
|
|
||||||
|
|
||||||
logger.info("Loading combined multilingual corpus:")
|
logger.info("Loading combined multilingual corpus:")
|
||||||
multi_wordlist = read_multilingual_csv(
|
multi_wordlist = read_multilingual_csv(wordlist_path('luminoso', 'multilingual.csv'))
|
||||||
os.path.join(source_dir, 'luminoso', 'multilingual.csv')
|
|
||||||
)
|
|
||||||
for lang in multi_wordlist:
|
for lang in multi_wordlist:
|
||||||
logger.info("\tLanguage: %s" % lang)
|
logger.info("\tLanguage: %s" % lang)
|
||||||
save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
|
save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
|
||||||
|
|
||||||
logger.info("Loading Twitter corpus.")
|
# Load Dutch from a separate source. We may end up with more languages like this.
|
||||||
twitter_wordlist = read_csv(
|
read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201504.csv'), 'surfaces', '*')
|
||||||
os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
|
|
||||||
)
|
|
||||||
save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
|
|
||||||
|
|
||||||
logger.info("Loading stemmed Twitter corpus.")
|
|
||||||
twitter_stems_wordlist = read_multilingual_csv(
|
|
||||||
os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv')
|
|
||||||
)
|
|
||||||
for lang in twitter_stems_wordlist:
|
|
||||||
logger.info("\tLanguage: %s" % lang)
|
|
||||||
save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
|
|
||||||
|
|
||||||
logger.info("Loading unstemmed Twitter corpus.")
|
|
||||||
twitter_surface_wordlist = read_multilingual_csv(
|
|
||||||
os.path.join(source_dir, 'luminoso', 'twitter-surfaces-2014.csv')
|
|
||||||
)
|
|
||||||
for lang in twitter_surface_wordlist:
|
|
||||||
logger.info("\tLanguage: %s" % lang)
|
|
||||||
save_wordlist_to_db(conn, 'twitter-surfaces', lang, twitter_surface_wordlist[lang])
|
|
||||||
|
|
||||||
logger.info("Done loading.")
|
logger.info("Done loading.")
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
|||||||
or os.path.expanduser('~/.cache/wordfreq'))
|
or os.path.expanduser('~/.cache/wordfreq'))
|
||||||
|
|
||||||
# When the minor version number increments, the data may change.
|
# When the minor version number increments, the data may change.
|
||||||
VERSION = '0.5.0'
|
VERSION = '0.7.0'
|
||||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||||
|
|
||||||
# Put these options together to make a database filename.
|
# Put these options together to make a database filename.
|
||||||
|
@ -0,0 +1 @@
|
|||||||
|
9b29de132c82bd7287c08c2937e3c4821525e356
|
@ -0,0 +1 @@
|
|||||||
|
956c3ff57edf5c45f3e850efd87a30d25c1b4bee
|
21282
wordfreq_data/luminoso/twitter-stems-2014-nl.csv
Normal file
21282
wordfreq_data/luminoso/twitter-stems-2014-nl.csv
Normal file
File diff suppressed because it is too large
Load Diff
23324
wordfreq_data/luminoso/twitter-surfaces-2014-nl.csv
Normal file
23324
wordfreq_data/luminoso/twitter-surfaces-2014-nl.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -1 +1 @@
|
|||||||
8ba8230ca42d8e9e622afee772b3a96c34126e23
|
1e9d162c0c1333ce4a9afd79cd8686805f1e19c3
|
1
wordfreq_data/wikipedia/stems-nl.csv.REMOVED.git-id
Normal file
1
wordfreq_data/wikipedia/stems-nl.csv.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
|||||||
|
b9d52d81bbe078a7de17519ed3494eb4771f0f69
|
1
wordfreq_data/wikipedia/surfaces-nl.csv.REMOVED.git-id
Normal file
1
wordfreq_data/wikipedia/surfaces-nl.csv.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
|||||||
|
f69e13f6be1183f69166fe287ada38354ce4de99
|
Loading…
Reference in New Issue
Block a user