cleanups to building and uploading, from code review

Former-commit-id: 5dee417302
This commit is contained in:
Robyn Speer 2014-08-18 14:14:01 -04:00
parent 759534392f
commit b357ffaa09
2 changed files with 32 additions and 40 deletions

View File

@@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
     raw_freqs = _read_csv_basic(filename)
     for wordlang in raw_freqs:
         word, lang = wordlang.rsplit('|', 1)
         word = standardize_word(word)
         unscaled[lang][word] = raw_freqs[wordlang]
-    scaled = {}
-    for key in unscaled:
-        scaled[key] = _scale_freqs(unscaled[key])
-    return scaled
+    return {key: _scale_freqs(unscaled[key]) for key in unscaled}
 
 def _read_csv_basic(filename):
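
Note: the hunk above collapses a build-a-dict-in-a-loop pattern into a single dict comprehension. A minimal standalone sketch of the same transformation, with a hypothetical transform standing in for _scale_freqs:

    # Before: accumulate into an empty dict, then return it.
    # After: one comprehension says the same thing.
    def rescale_all(unscaled, transform):
        return {key: transform(unscaled[key]) for key in unscaled}
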
@@ -87,12 +83,8 @@ def _scale_freqs(counts):
     Take in unscaled word counts or frequencies, and scale them so that
     they add up to 1.0.
     """
-    freqs = {}
     total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-    return freqs
+    return {word: counts[word] / total for word in counts}
 
 def save_wordlist_to_db(conn, listname, lang, freqs):
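
Note: the same comprehension cleanup applies to the normalization step above; dividing each count by the grand total makes the frequencies sum to 1.0. A tiny worked example with made-up counts (assumes Python 3's true division):

    counts = {'the': 6, 'cat': 3, 'sat': 1}    # hypothetical counts
    total = sum(counts.values())               # 10
    freqs = {word: counts[word] / total for word in counts}
    # freqs == {'the': 0.6, 'cat': 0.3, 'sat': 0.1}, summing to 1.0
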
@@ -122,15 +114,17 @@ def create_db(filename):
     This should be safe to run (and have no effect) if the database already
     exists.
     """
-    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
+    conn = get_db_connection(filename)
     conn.execute(schema.SCHEMA)
     for index_definition in schema.INDICES:
         conn.execute(index_definition)
     conn.commit()
+    return conn
 
 def get_db_connection(filename):
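
Note: the reordering in this hunk is load-bearing. The connection can only be opened once the database's parent directory exists, and returning the open connection spares callers a second lookup. A sketch of the resulting shape, assuming get_db_connection() wraps something like sqlite3.connect():

    import os
    import sqlite3

    def create_db(filename):
        # Make sure the parent directory exists before connecting;
        # sqlite3.connect() will not create intermediate directories.
        base_dir = os.path.dirname(filename)
        if base_dir and not os.path.exists(base_dir):
            os.makedirs(base_dir)
        conn = sqlite3.connect(filename)
        # ... apply schema and indices here, then commit ...
        conn.commit()
        return conn  # callers can reuse this connection directly
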
@@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
         filename = config.DB_FILENAME
 
     logger.info("Creating database")
-    create_db(filename)
-    conn = get_db_connection(filename)
+    conn = create_db(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)

View File

@@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
 from wordfreq import config
 import os
 import sys
-import shutil
-import tempfile
 import tarfile
 import logging
 import subprocess
 
 logger = logging.getLogger(__name__)
 
 if sys.version_info.major == 2:
+    PY2 = True
     from urllib import urlretrieve
 else:
+    PY2 = False
     from urllib.request import urlretrieve
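
Note: urlretrieve lives in a different module on each major Python version, and the new PY2 flag records the result of the version check so later code can branch on it without consulting sys.version_info again. The shim in isolation:

    import sys

    if sys.version_info.major == 2:
        PY2 = True
        from urllib import urlretrieve           # Python 2 location
    else:
        PY2 = False
        from urllib.request import urlretrieve   # Python 3 location

    # Either way, urlretrieve(url, path) downloads url to a local file.
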
@@ -103,36 +103,35 @@ def upload_data(upload_path=None):
     Collect the raw data and the database file, and upload them to an
     appropriate directory on the server that hosts downloads.
 
-    This requires that it's running in a reasonable Unix environment,
-    and more notably, that it has the proper SSH keys to upload to that
+    This requires that it's running in a reasonable Unix environment, on Python
+    3, and more notably, that it has the proper SSH keys to upload to that
     server.
     """
+    from tempfile import TemporaryDirectory
     if upload_path is None:
         upload_path = config.UPLOAD_PATH
 
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)
 
-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s" % source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)
 
-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        logger.info("Copying database file %s" % config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])
 
-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
-
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s" % upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])
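
Note: the with-block is the heart of this hunk. tempfile.TemporaryDirectory (Python 3 only, hence the docstring edit) deletes the build directory when the block exits, even on an exception, which is why the trailing shutil.rmtree() cleanup and the shutil/tempfile imports could go. The pattern in isolation, with a hypothetical scratch file:

    import os
    from tempfile import TemporaryDirectory

    # The positional argument is a suffix, mirroring mkdtemp('.wordfreq').
    with TemporaryDirectory('.wordfreq') as build_dir:
        scratch = os.path.join(build_dir, 'scratch.txt')  # hypothetical
        with open(scratch, 'w') as f:
            f.write('staged data\n')
    # build_dir and everything in it are gone here -- no rmtree needed.
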