code review and pep8 fixes

Former-commit-id: b4b8ba8be7
2024-12-23 17:31:41 +00:00 · 2013-11-01 17:05:12 -04:00 · 2013-11-01 17:05:12 -04:00 · ae6e03fa06
commit ae6e03fa06
parent 5168da105a
3 changed files with 31 additions and 32 deletions
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@ -36,7 +36,6 @@ def read_multilingual_csv(filename):
    raw_freqs = _read_csv_basic(filename)
    for wordlang in raw_freqs:
        word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
        unscaled[lang][word] = raw_freqs[wordlang]

    scaled = {}
@ -88,10 +87,7 @@ def _scale_freqs(counts):
    """
    freqs = {}
    total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    return {word: count / total for word, count in freqs.items()}


 def save_wordlist_to_db(conn, listname, lang, freqs):
@ -119,11 +115,11 @@ def create_db(filename):
    This should be safe to run (and have no effect) if the database already
    exists.
    """
-    conn = get_db_connection(filename)
    base_dir = os.path.dirname(filename)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

+    conn = get_db_connection(filename)
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@ -17,7 +17,7 @@ CACHE_SIZE = 100000
 # Where can the data be downloaded from?
 DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
                or 'http://ferret.lumi/dist/wordfreq/')
-RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
+RAW_DATA_URL = '/'.join([DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz'])
 DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
                      'wordfreq-%s.db' % MINOR_VERSION)

--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@ -65,7 +65,7 @@ def download(url, dest_filename):
    tracker = ProgressTracker(url)
    urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
    tracker.finish()
-    logger.info("Saved database to %s" % dest_filename)
+    logger.info("Saved database to %s", dest_filename)
    return True


@ -83,7 +83,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)

-    logger.info("Extracting %s" % dest_filename)
+    logger.info("Extracting %s", dest_filename)
    with tarfile.open(dest_filename, 'r') as tarf:
        tarf.extractall(root_dir)

@ -110,33 +110,36 @@ def upload_data(upload_path=None):
    This requires that it's running in a reasonable Unix environment,
    and more notably, that it has the proper SSH keys to upload to that
    server.
+
+    It should also be run only under Python 3, because otherwise you're
+    probably uploading the wrong data. We enforce this by importing
+    `tempfile.TemporaryDirectory`, which exists only in Python 3.
    """
+    from tempfile import TemporaryDirectory
    if upload_path is None:
        upload_path = config.UPLOAD_PATH
-    
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)

-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+    with TemporaryDirectory('.wordfreq') as build_tmp:
+        build_dir = build_tmp.name
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)

-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s", source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)

-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
+        logger.info("Copying database file %s", config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])

-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s", upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])