mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
v0.7: make a proper Dutch 'surfaces' list
This commit is contained in:
parent
6cf46ee5aa
commit
873ace87db
2
setup.py
2
setup.py
@@ -95,7 +95,7 @@ class CustomDevelopCommand(develop):
|
|||||||
self.run_command('download_db')
|
self.run_command('download_db')
|
||||||
|
|
||||||
|
|
||||||
requirements = ['ftfy >= 3']
|
requirements = ['ftfy >= 3, < 4']
|
||||||
if sys.version_info.major == 2:
|
if sys.version_info.major == 2:
|
||||||
requirements.append('functools32')
|
requirements.append('functools32')
|
||||||
|
|
||||||
|
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import codecs
|
import codecs
|
||||||
@@ -47,14 +48,15 @@ def _read_csv_basic(filename):
|
|||||||
|
|
||||||
counts = {}
|
counts = {}
|
||||||
for line in infile:
|
for line in infile:
|
||||||
line = line.rstrip(u'\n')
|
if ',' in line:
|
||||||
word, count = line.rsplit(u',', 1)
|
line = line.rstrip('\n')
|
||||||
count = float(count)
|
word, count = line.rsplit(',', 1)
|
||||||
counts[standardize_word(word)] = count
|
count = float(count)
|
||||||
|
counts[standardize_word(word)] = count
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
NUMBER_RE = re.compile(u'[0-9]+')
|
NUMBER_RE = re.compile('[0-9]+')
|
||||||
def read_leeds_corpus(filename):
|
def read_leeds_corpus(filename):
|
||||||
"""
|
"""
|
||||||
Load word frequencies from a "Web as Corpus" file, collected and
|
Load word frequencies from a "Web as Corpus" file, collected and
|
||||||
@@ -68,9 +70,9 @@ def read_leeds_corpus(filename):
|
|||||||
for line in infile:
|
for line in infile:
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
if line:
|
if line:
|
||||||
rank = line.split(u' ')[0]
|
rank = line.split(' ')[0]
|
||||||
if NUMBER_RE.match(rank) and line.count(u' ') == 2:
|
if NUMBER_RE.match(rank) and line.count(' ') == 2:
|
||||||
_, freq, token = line.split(u' ')
|
_, freq, token = line.split(' ')
|
||||||
token = standardize_word(ftfy(token))
|
token = standardize_word(ftfy(token))
|
||||||
freq = float(freq)
|
freq = float(freq)
|
||||||
counts[token] += freq
|
counts[token] += freq
|
||||||
@@ -119,7 +121,7 @@ def create_db(filename):
|
|||||||
os.makedirs(base_dir)
|
os.makedirs(base_dir)
|
||||||
|
|
||||||
conn = get_db_connection(filename)
|
conn = get_db_connection(filename)
|
||||||
|
|
||||||
conn.execute(schema.SCHEMA)
|
conn.execute(schema.SCHEMA)
|
||||||
for index_definition in schema.INDICES:
|
for index_definition in schema.INDICES:
|
||||||
conn.execute(index_definition)
|
conn.execute(index_definition)
|
||||||
@@ -197,7 +199,7 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
|||||||
save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
|
save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
|
||||||
|
|
||||||
# Load Dutch from a separate source. We may end up with more languages like this.
|
# Load Dutch from a separate source. We may end up with more languages like this.
|
||||||
read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201503.csv'), 'stems', '*')
|
read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201504.csv'), 'surfaces', '*')
|
||||||
|
|
||||||
logger.info("Done loading.")
|
logger.info("Done loading.")
|
||||||
|
|
||||||
|
@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
|||||||
or os.path.expanduser('~/.cache/wordfreq'))
|
or os.path.expanduser('~/.cache/wordfreq'))
|
||||||
|
|
||||||
# When the minor version number increments, the data may change.
|
# When the minor version number increments, the data may change.
|
||||||
VERSION = '0.6.0'
|
VERSION = '0.7.0'
|
||||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||||
|
|
||||||
# Put these options together to make a database filename.
|
# Put these options together to make a database filename.
|
||||||
|
1295646
wordfreq_data/luminoso/nl-combined-201504.csv
Normal file
1295646
wordfreq_data/luminoso/nl-combined-201504.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user