cleanups to building and uploading, from code review

2024-12-23 17:31:41 +00:00 · 2014-08-18 14:14:01 -04:00 · 2014-08-18 14:14:01 -04:00 · 5dee417302
commit 5dee417302
parent cb7b2b76e6
2 changed files with 32 additions and 40 deletions
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
    raw_freqs = _read_csv_basic(filename)
    for wordlang in raw_freqs:
        word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
        unscaled[lang][word] = raw_freqs[wordlang]

-    scaled = {}
-    for key in unscaled:
-        scaled[key] = _scale_freqs(unscaled[key])
-    return scaled
+    return {key: _scale_freqs(unscaled[key]) for key in unscaled}


 def _read_csv_basic(filename):
@@ -87,12 +83,8 @@ def _scale_freqs(counts):
    Take in unscaled word counts or frequencies, and scale them so that
    they add up to 1.0.
    """
-    freqs = {}
    total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    return {word: counts[word] / total for word in counts}


 def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -122,15 +114,17 @@ def create_db(filename):
    This should be safe to run (and have no effect) if the database already
    exists.
    """
-    conn = get_db_connection(filename)
    base_dir = os.path.dirname(filename)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

+    conn = get_db_connection(filename)
+    
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
    conn.commit()
+    return conn


 def get_db_connection(filename):
@@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
        filename = config.DB_FILENAME

    logger.info("Creating database")
-    create_db(filename)
+    conn = create_db(filename)

-    conn = get_db_connection(filename)
    logger.info("Loading Leeds internet corpus:")
    for lang in LEEDS_LANGUAGES:
        logger.info("\tLanguage: %s" % lang)
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
 from wordfreq import config
 import os
 import sys
-import shutil
-import tempfile
 import tarfile
 import logging
 import subprocess
 logger = logging.getLogger(__name__)

 if sys.version_info.major == 2:
+    PY2 = True
    from urllib import urlretrieve
 else:
+    PY2 = False
    from urllib.request import urlretrieve


@@ -103,14 +103,16 @@ def upload_data(upload_path=None):
    Collect the raw data and the database file, and upload them to an
    appropriate directory on the server that hosts downloads.

-    This requires that it's running in a reasonable Unix environment,
-    and more notably, that it has the proper SSH keys to upload to that
+    This requires that it's running in a reasonable Unix environment, on Python
+    3, and more notably, that it has the proper SSH keys to upload to that
    server.
    """
+    from tempfile import TemporaryDirectory
+
    if upload_path is None:
        upload_path = config.UPLOAD_PATH
    
-    build_dir = tempfile.mkdtemp('.wordfreq')
+    with TemporaryDirectory('.wordfreq') as build_dir:
        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
        os.makedirs(version_dir)

@@ -133,6 +135,3 @@ def upload_data(upload_path=None):
            version_dir,
            upload_path
        ])
-
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)