cleanups to building and uploading, from code review

Former-commit-id: 5dee417302
2024-12-23 09:21:37 +00:00 · 2014-08-18 14:14:01 -04:00 · 2014-08-18 14:14:01 -04:00 · b357ffaa09
commit b357ffaa09
parent 759534392f
2 changed files with 32 additions and 40 deletions
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
    raw_freqs = _read_csv_basic(filename)
    for wordlang in raw_freqs:
        word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
        unscaled[lang][word] = raw_freqs[wordlang]

-    scaled = {}
-    for key in unscaled:
-        scaled[key] = _scale_freqs(unscaled[key])
-    return scaled
+    return {key: _scale_freqs(unscaled[key]) for key in unscaled}


 def _read_csv_basic(filename):
@@ -87,12 +83,8 @@ def _scale_freqs(counts):
    Take in unscaled word counts or frequencies, and scale them so that
    they add up to 1.0.
    """
-    freqs = {}
    total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    return {word: counts[word] / total for word in counts}


 def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -122,15 +114,17 @@ def create_db(filename):
    This should be safe to run (and have no effect) if the database already
    exists.
    """
-    conn = get_db_connection(filename)
    base_dir = os.path.dirname(filename)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

+    conn = get_db_connection(filename)
+    
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
    conn.commit()
+    return conn


 def get_db_connection(filename):
@@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
        filename = config.DB_FILENAME

    logger.info("Creating database")
-    create_db(filename)
+    conn = create_db(filename)

-    conn = get_db_connection(filename)
    logger.info("Loading Leeds internet corpus:")
    for lang in LEEDS_LANGUAGES:
        logger.info("\tLanguage: %s" % lang)
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
 from wordfreq import config
 import os
 import sys
-import shutil
-import tempfile
 import tarfile
 import logging
 import subprocess
 logger = logging.getLogger(__name__)

 if sys.version_info.major == 2:
+    PY2 = True
    from urllib import urlretrieve
 else:
+    PY2 = False
    from urllib.request import urlretrieve


@@ -103,36 +103,35 @@ def upload_data(upload_path=None):
    Collect the raw data and the database file, and upload them to an
    appropriate directory on the server that hosts downloads.

-    This requires that it's running in a reasonable Unix environment,
-    and more notably, that it has the proper SSH keys to upload to that
+    This requires that it's running in a reasonable Unix environment, on Python
+    3, and more notably, that it has the proper SSH keys to upload to that
    server.
    """
+    from tempfile import TemporaryDirectory
+
    if upload_path is None:
        upload_path = config.UPLOAD_PATH
    
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)

-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s" % source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)

-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        logger.info("Copying database file %s" % config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])

-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
-
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s" % upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])