From 5dee417302a45f0d625b099569d46fd643d5792d Mon Sep 17 00:00:00 2001
From: Rob Speer <rob@luminoso.com>
Date: Mon, 18 Aug 2014 14:14:01 -0400
Subject: [PATCH] cleanups to building and uploading, from code review

---
 wordfreq/build.py    | 19 +++++-----------
 wordfreq/transfer.py | 53 ++++++++++++++++++++++----------------------
 2 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/wordfreq/build.py b/wordfreq/build.py
index 486b559..778b2dc 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
     raw_freqs = _read_csv_basic(filename)
     for wordlang in raw_freqs:
         word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
         unscaled[lang][word] = raw_freqs[wordlang]
 
-    scaled = {}
-    for key in unscaled:
-        scaled[key] = _scale_freqs(unscaled[key])
-    return scaled
+    return {key: _scale_freqs(unscaled[key]) for key in unscaled}
 
 
 def _read_csv_basic(filename):
@@ -87,12 +83,8 @@ def _scale_freqs(counts):
     Take in unscaled word counts or frequencies, and scale them so that
     they add up to 1.0.
     """
-    freqs = {}
     total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    return {word: counts[word] / total for word in counts}
 
 
 def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -122,15 +114,17 @@ def create_db(filename):
     This should be safe to run (and have no effect) if the database already
     exists.
     """
-    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
 
+    conn = get_db_connection(filename)
+    
     conn.execute(schema.SCHEMA)
     for index_definition in schema.INDICES:
         conn.execute(index_definition)
     conn.commit()
+    return conn
 
 
 def get_db_connection(filename):
@@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
         filename = config.DB_FILENAME
 
     logger.info("Creating database")
-    create_db(filename)
+    conn = create_db(filename)
 
-    conn = get_db_connection(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
diff --git a/wordfreq/transfer.py b/wordfreq/transfer.py
index 0050876..170bf30 100644
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
 from wordfreq import config
 import os
 import sys
-import shutil
-import tempfile
 import tarfile
 import logging
 import subprocess
 logger = logging.getLogger(__name__)
 
 if sys.version_info.major == 2:
+    PY2 = True
     from urllib import urlretrieve
 else:
+    PY2 = False
     from urllib.request import urlretrieve
 
 
@@ -103,36 +103,35 @@ def upload_data(upload_path=None):
     Collect the raw data and the database file, and upload them to an
     appropriate directory on the server that hosts downloads.
 
-    This requires that it's running in a reasonable Unix environment,
-    and more notably, that it has the proper SSH keys to upload to that
+    This requires that it's running in a reasonable Unix environment, on Python
+    3, and more notably, that it has the proper SSH keys to upload to that
     server.
     """
+    from tempfile import TemporaryDirectory
+
     if upload_path is None:
         upload_path = config.UPLOAD_PATH
     
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)
 
-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s" % source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)
 
-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        logger.info("Copying database file %s" % config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])
 
-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
-
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s" % upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])