v0.7: make a proper Dutch 'surfaces' list

This commit is contained in:
Rob Speer 2015-04-30 13:01:24 -04:00
parent 6cf46ee5aa
commit 873ace87db
4 changed files with 1295660 additions and 12 deletions

View File

@@ -95,7 +95,7 @@ class CustomDevelopCommand(develop):
self.run_command('download_db') self.run_command('download_db')
requirements = ['ftfy >= 3'] requirements = ['ftfy >= 3, < 4']
if sys.version_info.major == 2: if sys.version_info.major == 2:
requirements.append('functools32') requirements.append('functools32')

View File

@@ -1,3 +1,4 @@
from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
import sqlite3 import sqlite3
import codecs import codecs
@@ -47,14 +48,15 @@ def _read_csv_basic(filename):
counts = {} counts = {}
for line in infile: for line in infile:
line = line.rstrip(u'\n') if ',' in line:
word, count = line.rsplit(u',', 1) line = line.rstrip('\n')
count = float(count) word, count = line.rsplit(',', 1)
counts[standardize_word(word)] = count count = float(count)
counts[standardize_word(word)] = count
return counts return counts
NUMBER_RE = re.compile(u'[0-9]+') NUMBER_RE = re.compile('[0-9]+')
def read_leeds_corpus(filename): def read_leeds_corpus(filename):
""" """
Load word frequencies from a "Web as Corpus" file, collected and Load word frequencies from a "Web as Corpus" file, collected and
@@ -68,9 +70,9 @@ def read_leeds_corpus(filename):
for line in infile: for line in infile:
line = line.rstrip() line = line.rstrip()
if line: if line:
rank = line.split(u' ')[0] rank = line.split(' ')[0]
if NUMBER_RE.match(rank) and line.count(u' ') == 2: if NUMBER_RE.match(rank) and line.count(' ') == 2:
_, freq, token = line.split(u' ') _, freq, token = line.split(' ')
token = standardize_word(ftfy(token)) token = standardize_word(ftfy(token))
freq = float(freq) freq = float(freq)
counts[token] += freq counts[token] += freq
@@ -119,7 +121,7 @@ def create_db(filename):
os.makedirs(base_dir) os.makedirs(base_dir)
conn = get_db_connection(filename) conn = get_db_connection(filename)
conn.execute(schema.SCHEMA) conn.execute(schema.SCHEMA)
for index_definition in schema.INDICES: for index_definition in schema.INDICES:
conn.execute(index_definition) conn.execute(index_definition)
@@ -197,7 +199,7 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang]) save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
# Load Dutch from a separate source. We may end up with more languages like this. # Load Dutch from a separate source. We may end up with more languages like this.
read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201503.csv'), 'stems', '*') read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201504.csv'), 'surfaces', '*')
logger.info("Done loading.") logger.info("Done loading.")

View File

@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
or os.path.expanduser('~/.cache/wordfreq')) or os.path.expanduser('~/.cache/wordfreq'))
# When the minor version number increments, the data may change. # When the minor version number increments, the data may change.
VERSION = '0.6.0' VERSION = '0.7.0'
MINOR_VERSION = '.'.join(VERSION.split('.')[:2]) MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
# Put these options together to make a database filename. # Put these options together to make a database filename.

File diff suppressed because it is too large Load Diff