From 5dee417302a45f0d625b099569d46fd643d5792d Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Mon, 18 Aug 2014 14:14:01 -0400 Subject: [PATCH] cleanups to building and uploading, from code review --- wordfreq/build.py | 19 +++++----------- wordfreq/transfer.py | 53 ++++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/wordfreq/build.py b/wordfreq/build.py index 486b559..778b2dc 100644 --- a/wordfreq/build.py +++ b/wordfreq/build.py @@ -37,13 +37,9 @@ def read_multilingual_csv(filename): raw_freqs = _read_csv_basic(filename) for wordlang in raw_freqs: word, lang = wordlang.rsplit('|', 1) - word = standardize_word(word) unscaled[lang][word] = raw_freqs[wordlang] - scaled = {} - for key in unscaled: - scaled[key] = _scale_freqs(unscaled[key]) - return scaled + return {key: _scale_freqs(unscaled[key]) for key in unscaled} def _read_csv_basic(filename): @@ -87,12 +83,8 @@ def _scale_freqs(counts): Take in unscaled word counts or frequencies, and scale them so that they add up to 1.0. """ - freqs = {} total = sum(counts.values()) - for word in counts: - freqs[word] = counts[word] / total - - return freqs + return {word: counts[word] / total for word in counts} def save_wordlist_to_db(conn, listname, lang, freqs): @@ -122,15 +114,17 @@ def create_db(filename): This should be safe to run (and have no effect) if the database already exists. """ - conn = get_db_connection(filename) base_dir = os.path.dirname(filename) if not os.path.exists(base_dir): os.makedirs(base_dir) + conn = get_db_connection(filename) + conn.execute(schema.SCHEMA) for index_definition in schema.INDICES: conn.execute(index_definition) conn.commit() + return conn def get_db_connection(filename): @@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False): filename = config.DB_FILENAME logger.info("Creating database") - create_db(filename) + conn = create_db(filename) - conn = get_db_connection(filename) logger.info("Loading Leeds internet corpus:") for lang in LEEDS_LANGUAGES: logger.info("\tLanguage: %s" % lang) diff --git a/wordfreq/transfer.py b/wordfreq/transfer.py index 0050876..170bf30 100644 --- a/wordfreq/transfer.py +++ b/wordfreq/transfer.py @@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py. from wordfreq import config import os import sys -import shutil -import tempfile import tarfile import logging import subprocess logger = logging.getLogger(__name__) if sys.version_info.major == 2: + PY2 = True from urllib import urlretrieve else: + PY2 = False from urllib.request import urlretrieve @@ -103,36 +103,35 @@ def upload_data(upload_path=None): Collect the raw data and the database file, and upload them to an appropriate directory on the server that hosts downloads. - This requires that it's running in a reasonable Unix environment, - and more notably, that it has the proper SSH keys to upload to that + This requires that it's running in a reasonable Unix environment, on Python + 3, and more notably, that it has the proper SSH keys to upload to that server. """ + from tempfile import TemporaryDirectory + if upload_path is None: upload_path = config.UPLOAD_PATH - build_dir = tempfile.mkdtemp('.wordfreq') - version_dir = os.path.join(build_dir, config.MINOR_VERSION) - os.makedirs(version_dir) + with TemporaryDirectory('.wordfreq') as build_dir: + version_dir = os.path.join(build_dir, config.MINOR_VERSION) + os.makedirs(version_dir) - source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz') - logger.info("Creating %s" % source_filename) - with tarfile.open(source_filename, 'w:gz') as tarf: - tarf.add(config.RAW_DATA_DIR) + source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz') + logger.info("Creating %s" % source_filename) + with tarfile.open(source_filename, 'w:gz') as tarf: + tarf.add(config.RAW_DATA_DIR) - logger.info("Copying database file %s" % config.DB_FILENAME) - subprocess.call([ - '/bin/cp', - config.DB_FILENAME, - version_dir - ]) + logger.info("Copying database file %s" % config.DB_FILENAME) + subprocess.call([ + '/bin/cp', + config.DB_FILENAME, + version_dir + ]) - logger.info("Uploading to %s" % upload_path) - subprocess.call([ - '/usr/bin/rsync', - '-avz', - version_dir, - upload_path - ]) - - logger.info("Removing build directory %s" % build_dir) - shutil.rmtree(build_dir) + logger.info("Uploading to %s" % upload_path) + subprocess.call([ + '/usr/bin/rsync', + '-avz', + version_dir, + upload_path + ])