cleanups to building and uploading, from code review

Rob Speer 2014-08-18 14:14:01 -04:00
parent cb7b2b76e6
commit 5dee417302
2 changed files with 32 additions and 40 deletions

View File

@@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
     raw_freqs = _read_csv_basic(filename)
     for wordlang in raw_freqs:
         word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
         unscaled[lang][word] = raw_freqs[wordlang]
-    scaled = {}
-    for key in unscaled:
-        scaled[key] = _scale_freqs(unscaled[key])
-    return scaled
+    return {key: _scale_freqs(unscaled[key]) for key in unscaled}
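
The change above folds an accumulate-and-return loop into a single dict comprehension. A minimal sketch of the equivalence, using made-up per-language counts and a stand-in for _scale_freqs (not the real data or helper):

    # hypothetical input shaped like the builder's intermediate dict:
    # {language: {word: count}}
    unscaled = {'en': {'the': 6, 'of': 4}, 'de': {'der': 3, 'die': 2}}

    def scale(counts):
        # stand-in for _scale_freqs; see the next hunk
        total = sum(counts.values())
        return {word: counts[word] / total for word in counts}

    # old form: explicit loop into a temporary dict
    scaled = {}
    for key in unscaled:
        scaled[key] = scale(unscaled[key])

    # new form: one comprehension, same result
    assert scaled == {key: scale(unscaled[key]) for key in unscaled}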
def _read_csv_basic(filename):
@@ -87,12 +83,8 @@ def _scale_freqs(counts):
     Take in unscaled word counts or frequencies, and scale them so that
     they add up to 1.0.
     """
-    freqs = {}
     total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-    return freqs
+    return {word: counts[word] / total for word in counts}
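
Same pattern inside _scale_freqs itself: the comprehension normalizes the counts so they sum to 1.0. A quick worked check with hypothetical counts (this assumes Python 3's true division, or floating-point counts, so 1 / 4 is 0.25 rather than 0):

    def _scale_freqs(counts):
        total = sum(counts.values())
        return {word: counts[word] / total for word in counts}

    freqs = _scale_freqs({'a': 3, 'b': 1})
    assert freqs == {'a': 0.75, 'b': 0.25}
    assert abs(sum(freqs.values()) - 1.0) < 1e-9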
def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -122,15 +114,17 @@ def create_db(filename):
     This should be safe to run (and have no effect) if the database already
     exists.
     """
-    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
+    conn = get_db_connection(filename)
     conn.execute(schema.SCHEMA)
     for index_definition in schema.INDICES:
         conn.execute(index_definition)
     conn.commit()
+    return conn
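
The reordering above matters because sqlite3 can't create a database file inside a directory that doesn't exist yet, so os.makedirs has to run before the connection is opened. A sketch of the fixed flow, assuming get_db_connection is a thin wrapper around sqlite3.connect (its body isn't shown in this diff) and with a stand-in for schema.SCHEMA:

    import os
    import sqlite3

    def create_db(filename):
        base_dir = os.path.dirname(filename)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)          # must happen first, or connect() fails
        conn = sqlite3.connect(filename)   # assumed equivalent of get_db_connection
        conn.execute('CREATE TABLE IF NOT EXISTS words (word TEXT)')  # stand-in schema
        conn.commit()
        return conn                        # new: hand the connection back to the caller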
def get_db_connection(filename):
@@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
         filename = config.DB_FILENAME

     logger.info("Creating database")
-    create_db(filename)
-    conn = get_db_connection(filename)
+    conn = create_db(filename)

     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
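
With create_db now returning the open connection, the redundant get_db_connection call disappears; roughly, callers do:

    conn = create_db(filename)   # creates the file if needed and hands back the handle
    save_wordlist_to_db(conn, 'leeds-internet', lang, freqs)   # hypothetical follow-on call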

View File

@@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
 from wordfreq import config
 import os
 import sys
-import shutil
-import tempfile
 import tarfile
 import logging
 import subprocess

 logger = logging.getLogger(__name__)

 if sys.version_info.major == 2:
+    PY2 = True
     from urllib import urlretrieve
 else:
+    PY2 = False
     from urllib.request import urlretrieve
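
The new PY2 flag just records which branch of the version check ran, so later code can test a boolean instead of re-inspecting sys.version_info. The pattern, as a self-contained sketch:

    import sys

    PY2 = sys.version_info.major == 2    # same flag, set in one line
    if PY2:
        from urllib import urlretrieve             # Python 2 location
    else:
        from urllib.request import urlretrieve     # Python 3 location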
@@ -103,36 +103,35 @@ def upload_data(upload_path=None):
     Collect the raw data and the database file, and upload them to an
     appropriate directory on the server that hosts downloads.

-    This requires that it's running in a reasonable Unix environment,
-    and more notably, that it has the proper SSH keys to upload to that
+    This requires that it's running in a reasonable Unix environment, on Python
+    3, and more notably, that it has the proper SSH keys to upload to that
     server.
     """
+    from tempfile import TemporaryDirectory
     if upload_path is None:
         upload_path = config.UPLOAD_PATH

-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
-
-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
-
-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
-
-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
-
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)
+
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s" % source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)
+
+        logger.info("Copying database file %s" % config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])
+
+        logger.info("Uploading to %s" % upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])
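
tempfile.TemporaryDirectory (Python 3.2+, hence the docstring's new Python 3 requirement) is a context manager that removes the directory and everything in it when the block exits, even if an exception escapes; the old mkdtemp/rmtree pair only cleaned up when the function ran to completion. A minimal sketch of the behavior:

    import os
    from tempfile import TemporaryDirectory

    with TemporaryDirectory('.wordfreq') as build_dir:   # first argument is the suffix
        path = os.path.join(build_dir, 'scratch.txt')
        with open(path, 'w') as f:
            f.write('temporary build artifact')
        assert os.path.exists(path)

    # the whole tree is gone on exit; no explicit shutil.rmtree() needed
    assert not os.path.exists(build_dir)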