From ae6e03fa06bcda2aebc791742f283746300ed8de Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Fri, 1 Nov 2013 17:05:12 -0400
Subject: [PATCH] code review and pep8 fixes

Former-commit-id: b4b8ba8be7771c183fad83d75b22019d8ee761ca
---
 wordfreq/build.py    |  8 ++-----
 wordfreq/config.py   |  2 +-
 wordfreq/transfer.py | 53 +++++++++++++++++++++++---------------------
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/wordfreq/build.py b/wordfreq/build.py
index d5f1609..b52eafc 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -36,7 +36,6 @@ def read_multilingual_csv(filename):
     raw_freqs = _read_csv_basic(filename)
     for wordlang in raw_freqs:
         word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
         unscaled[lang][word] = raw_freqs[wordlang]
 
     scaled = {}
@@ -88,10 +87,7 @@ def _scale_freqs(counts):
     """
     freqs = {}
     total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    return {word: count / total for word, count in counts.items()}
 
 
 def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -119,11 +115,11 @@ def create_db(filename):
     This should be safe to run (and have no effect) if the database already
     exists.
     """
-    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
 
+    conn = get_db_connection(filename)
     conn.execute(schema.SCHEMA)
     for index_definition in schema.INDICES:
         conn.execute(index_definition)
diff --git a/wordfreq/config.py b/wordfreq/config.py
index 86c225f..e58a9f7 100644
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@@ -17,7 +17,7 @@ CACHE_SIZE = 100000
 # Where can the data be downloaded from?
 DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
                 or 'http://ferret.lumi/dist/wordfreq/')
-RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
+RAW_DATA_URL = '/'.join([DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz'])
 DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
                       'wordfreq-%s.db' % MINOR_VERSION)
 
diff --git a/wordfreq/transfer.py b/wordfreq/transfer.py
index f5885fa..47d49dd 100644
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -65,7 +65,7 @@ def download(url, dest_filename):
     tracker = ProgressTracker(url)
     urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
     tracker.finish()
-    logger.info("Saved database to %s" % dest_filename)
+    logger.info("Saved database to %s", dest_filename)
     return True
 
 
@@ -83,7 +83,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
     ensure_dir_exists(dest_filename)
     download(url, dest_filename)
 
-    logger.info("Extracting %s" % dest_filename)
+    logger.info("Extracting %s", dest_filename)
     with tarfile.open(dest_filename, 'r') as tarf:
         tarf.extractall(root_dir)
 
@@ -110,33 +110,36 @@ def upload_data(upload_path=None):
     This requires that it's running in a reasonable Unix environment,
     and more notably, that it has the proper SSH keys to upload to that
     server.
+
+    It should also only be run in Python 3, because otherwise you're probably
+    uploading the wrong data. We can even ensure this by using features that
+    are specific to Python 3.
     """
+    from tempfile import TemporaryDirectory
     if upload_path is None:
         upload_path = config.UPLOAD_PATH
-    
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
 
-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        # Entering the context yields the directory path as a str.
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)
 
-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s", source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)
 
-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
+        logger.info("Copying database file %s", config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])
 
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s", upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])