diff --git a/wordfreq/build.py b/wordfreq/build.py
index d5f1609..b52eafc 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -36,7 +36,6 @@ def read_multilingual_csv(filename):
     raw_freqs = _read_csv_basic(filename)
     for wordlang in raw_freqs:
         word, lang = wordlang.rsplit('|', 1)
-        word = standardize_word(word)
         unscaled[lang][word] = raw_freqs[wordlang]
 
     scaled = {}
@@ -88,10 +87,7 @@ def _scale_freqs(counts):
     """
-    freqs = {}
     total = sum(counts.values())
-    for word in counts:
-        freqs[word] = counts[word] / total
-
-    return freqs
+    # Divide each count by the grand total to get relative frequencies.
+    return {word: count / total for word, count in counts.items()}
 
 
 def save_wordlist_to_db(conn, listname, lang, freqs):
@@ -119,11 +115,11 @@ def create_db(filename):
     This should be safe to run (and have no effect) if the database already
     exists.
     """
-    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
 
+    conn = get_db_connection(filename)
     conn.execute(schema.SCHEMA)
     for index_definition in schema.INDICES:
         conn.execute(index_definition)
diff --git a/wordfreq/config.py b/wordfreq/config.py
index 86c225f..e58a9f7 100644
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@@ -17,7 +17,10 @@ CACHE_SIZE = 100000
 # Where can the data be downloaded from?
 DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL') or
                 'http://ferret.lumi/dist/wordfreq/')
-RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
-DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
-                      'wordfreq-%s.db' % MINOR_VERSION)
+# Build URLs with '/' rather than os.path.join, whose separator is
+# platform-dependent; strip DOWNLOAD_URL's trailing '/' to avoid '//'.
+RAW_DATA_URL = '/'.join([DOWNLOAD_URL.rstrip('/'), MINOR_VERSION,
+                         'wordfreq-data.tar.gz'])
+DB_URL = '/'.join([DOWNLOAD_URL.rstrip('/'), MINOR_VERSION,
+                   'wordfreq-%s.db' % MINOR_VERSION])
 
diff --git a/wordfreq/transfer.py b/wordfreq/transfer.py
index f5885fa..47d49dd 100644
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -65,7 +65,7 @@ def download(url, dest_filename):
     tracker = ProgressTracker(url)
     urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
     tracker.finish()
-    logger.info("Saved database to %s" % dest_filename)
+    logger.info("Saved database to %s", dest_filename)
     return True
 
 
@@ -83,7 +83,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
     ensure_dir_exists(dest_filename)
     download(url, dest_filename)
 
-    logger.info("Extracting %s" % dest_filename)
+    logger.info("Extracting %s", dest_filename)
     with tarfile.open(dest_filename, 'r') as tarf:
         tarf.extractall(root_dir)
 
@@ -110,33 +110,38 @@ def upload_data(upload_path=None):
 
     This requires that it's running in a reasonable Unix environment, and more
     notably, that it has the proper SSH keys to upload to that server.
+
+    It should also only be run in Python 3, because otherwise you're probably
+    uploading the wrong data. We can even ensure this by using features that
+    are specific to Python 3.
     """
+    from tempfile import TemporaryDirectory
     if upload_path is None:
         upload_path = config.UPLOAD_PATH
-
-    build_dir = tempfile.mkdtemp('.wordfreq')
-    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
-    os.makedirs(version_dir)
-    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
-    logger.info("Creating %s" % source_filename)
-    with tarfile.open(source_filename, 'w:gz') as tarf:
-        tarf.add(config.RAW_DATA_DIR)
+
+    # The context manager yields the directory path as a string and deletes
+    # the directory when the block exits, so no explicit cleanup is needed.
+    with TemporaryDirectory('.wordfreq') as build_dir:
+        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+        os.makedirs(version_dir)
 
-    logger.info("Copying database file %s" % config.DB_FILENAME)
-    subprocess.call([
-        '/bin/cp',
-        config.DB_FILENAME,
-        version_dir
-    ])
+        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+        logger.info("Creating %s", source_filename)
+        with tarfile.open(source_filename, 'w:gz') as tarf:
+            tarf.add(config.RAW_DATA_DIR)
 
-    logger.info("Uploading to %s" % upload_path)
-    subprocess.call([
-        '/usr/bin/rsync',
-        '-avz',
-        version_dir,
-        upload_path
-    ])
+        logger.info("Copying database file %s", config.DB_FILENAME)
+        subprocess.call([
+            '/bin/cp',
+            config.DB_FILENAME,
+            version_dir
+        ])
 
-    logger.info("Removing build directory %s" % build_dir)
-    shutil.rmtree(build_dir)
+        logger.info("Uploading to %s", upload_path)
+        subprocess.call([
+            '/usr/bin/rsync',
+            '-avz',
+            version_dir,
+            upload_path
+        ])
 