Code review and PEP 8 fixes

Former-commit-id: b4b8ba8be7
This commit is contained in:
Robyn Speer 2013-11-01 17:05:12 -04:00
parent 5168da105a
commit ae6e03fa06
3 changed files with 31 additions and 32 deletions

View File

@ -36,7 +36,6 @@ def read_multilingual_csv(filename):
raw_freqs = _read_csv_basic(filename)
for wordlang in raw_freqs:
word, lang = wordlang.rsplit('|', 1)
word = standardize_word(word)
unscaled[lang][word] = raw_freqs[wordlang]
scaled = {}
@ -88,10 +87,7 @@ def _scale_freqs(counts):
"""
freqs = {}
total = sum(counts.values())
for word in counts:
freqs[word] = counts[word] / total
return freqs
return {word: count / total for word, count in freqs.items()}
def save_wordlist_to_db(conn, listname, lang, freqs):
@ -119,11 +115,11 @@ def create_db(filename):
This should be safe to run (and have no effect) if the database already
exists.
"""
conn = get_db_connection(filename)
base_dir = os.path.dirname(filename)
if not os.path.exists(base_dir):
os.makedirs(base_dir)
conn = get_db_connection(filename)
conn.execute(schema.SCHEMA)
for index_definition in schema.INDICES:
conn.execute(index_definition)

View File

@ -17,7 +17,7 @@ CACHE_SIZE = 100000
# Where can the data be downloaded from?
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
or 'http://ferret.lumi/dist/wordfreq/')
RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
RAW_DATA_URL = '/'.join([DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz'])
DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
'wordfreq-%s.db' % MINOR_VERSION)

View File

@ -65,7 +65,7 @@ def download(url, dest_filename):
tracker = ProgressTracker(url)
urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
tracker.finish()
logger.info("Saved database to %s" % dest_filename)
logger.info("Saved database to %s", dest_filename)
return True
@ -83,7 +83,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
ensure_dir_exists(dest_filename)
download(url, dest_filename)
logger.info("Extracting %s" % dest_filename)
logger.info("Extracting %s", dest_filename)
with tarfile.open(dest_filename, 'r') as tarf:
tarf.extractall(root_dir)
@ -110,33 +110,36 @@ def upload_data(upload_path=None):
This requires that it's running in a reasonable Unix environment,
and more notably, that it has the proper SSH keys to upload to that
server.
It should also only be run in Python 3, because otherwise you're probably
uploading the wrong data. We can even ensure this by using features that
are specific to Python 3.
"""
from tempfile import TemporaryDirectory
if upload_path is None:
upload_path = config.UPLOAD_PATH
build_dir = tempfile.mkdtemp('.wordfreq')
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
os.makedirs(version_dir)
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
logger.info("Creating %s" % source_filename)
with tarfile.open(source_filename, 'w:gz') as tarf:
tarf.add(config.RAW_DATA_DIR)
with TemporaryDirectory('.wordfreq') as build_tmp:
build_dir = build_tmp.name
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
os.makedirs(version_dir)
logger.info("Copying database file %s" % config.DB_FILENAME)
subprocess.call([
'/bin/cp',
config.DB_FILENAME,
version_dir
])
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
logger.info("Creating %s", source_filename)
with tarfile.open(source_filename, 'w:gz') as tarf:
tarf.add(config.RAW_DATA_DIR)
logger.info("Uploading to %s" % upload_path)
subprocess.call([
'/usr/bin/rsync',
'-avz',
version_dir,
upload_path
])
logger.info("Copying database file %s", config.DB_FILENAME)
subprocess.call([
'/bin/cp',
config.DB_FILENAME,
version_dir
])
logger.info("Removing build directory %s" % build_dir)
shutil.rmtree(build_dir)
logger.info("Uploading to %s", upload_path)
subprocess.call([
'/usr/bin/rsync',
'-avz',
version_dir,
upload_path
])