code review and pep8 fixes

This commit is contained in:
Rob Speer 2013-11-01 17:05:12 -04:00
parent 2b2bd943d2
commit b4b8ba8be7
3 changed files with 31 additions and 32 deletions

View File

@ -36,7 +36,6 @@ def read_multilingual_csv(filename):
raw_freqs = _read_csv_basic(filename) raw_freqs = _read_csv_basic(filename)
for wordlang in raw_freqs: for wordlang in raw_freqs:
word, lang = wordlang.rsplit('|', 1) word, lang = wordlang.rsplit('|', 1)
word = standardize_word(word)
unscaled[lang][word] = raw_freqs[wordlang] unscaled[lang][word] = raw_freqs[wordlang]
scaled = {} scaled = {}
@ -88,10 +87,7 @@ def _scale_freqs(counts):
""" """
freqs = {} freqs = {}
total = sum(counts.values()) total = sum(counts.values())
for word in counts: return {word: count / total for word, count in freqs.items()}
freqs[word] = counts[word] / total
return freqs
def save_wordlist_to_db(conn, listname, lang, freqs): def save_wordlist_to_db(conn, listname, lang, freqs):
@ -119,11 +115,11 @@ def create_db(filename):
This should be safe to run (and have no effect) if the database already This should be safe to run (and have no effect) if the database already
exists. exists.
""" """
conn = get_db_connection(filename)
base_dir = os.path.dirname(filename) base_dir = os.path.dirname(filename)
if not os.path.exists(base_dir): if not os.path.exists(base_dir):
os.makedirs(base_dir) os.makedirs(base_dir)
conn = get_db_connection(filename)
conn.execute(schema.SCHEMA) conn.execute(schema.SCHEMA)
for index_definition in schema.INDICES: for index_definition in schema.INDICES:
conn.execute(index_definition) conn.execute(index_definition)

View File

@ -17,7 +17,7 @@ CACHE_SIZE = 100000
# Where can the data be downloaded from? # Where can the data be downloaded from?
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL') DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
or 'http://ferret.lumi/dist/wordfreq/') or 'http://ferret.lumi/dist/wordfreq/')
RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz') RAW_DATA_URL = '/'.join([DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz'])
DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
'wordfreq-%s.db' % MINOR_VERSION) 'wordfreq-%s.db' % MINOR_VERSION)

View File

@ -65,7 +65,7 @@ def download(url, dest_filename):
tracker = ProgressTracker(url) tracker = ProgressTracker(url)
urlretrieve(url, dest_filename, reporthook=tracker.report_progress) urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
tracker.finish() tracker.finish()
logger.info("Saved database to %s" % dest_filename) logger.info("Saved database to %s", dest_filename)
return True return True
@ -83,7 +83,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
ensure_dir_exists(dest_filename) ensure_dir_exists(dest_filename)
download(url, dest_filename) download(url, dest_filename)
logger.info("Extracting %s" % dest_filename) logger.info("Extracting %s", dest_filename)
with tarfile.open(dest_filename, 'r') as tarf: with tarfile.open(dest_filename, 'r') as tarf:
tarf.extractall(root_dir) tarf.extractall(root_dir)
@ -110,33 +110,36 @@ def upload_data(upload_path=None):
This requires that it's running in a reasonable Unix environment, This requires that it's running in a reasonable Unix environment,
and more notably, that it has the proper SSH keys to upload to that and more notably, that it has the proper SSH keys to upload to that
server. server.
It should also only be run in Python 3, because otherwise you're probably
uploading the wrong data. We can even ensure this by using features that
are specific to Python 3.
""" """
from tempfile import TemporaryDirectory
if upload_path is None: if upload_path is None:
upload_path = config.UPLOAD_PATH upload_path = config.UPLOAD_PATH
build_dir = tempfile.mkdtemp('.wordfreq') with TemporaryDirectory('.wordfreq') as build_tmp:
version_dir = os.path.join(build_dir, config.MINOR_VERSION) build_dir = build_tmp.name
os.makedirs(version_dir) version_dir = os.path.join(build_dir, config.MINOR_VERSION)
os.makedirs(version_dir)
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz') source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
logger.info("Creating %s" % source_filename) logger.info("Creating %s", source_filename)
with tarfile.open(source_filename, 'w:gz') as tarf: with tarfile.open(source_filename, 'w:gz') as tarf:
tarf.add(config.RAW_DATA_DIR) tarf.add(config.RAW_DATA_DIR)
logger.info("Copying database file %s" % config.DB_FILENAME) logger.info("Copying database file %s", config.DB_FILENAME)
subprocess.call([ subprocess.call([
'/bin/cp', '/bin/cp',
config.DB_FILENAME, config.DB_FILENAME,
version_dir version_dir
]) ])
logger.info("Uploading to %s" % upload_path) logger.info("Uploading to %s", upload_path)
subprocess.call([ subprocess.call([
'/usr/bin/rsync', '/usr/bin/rsync',
'-avz', '-avz',
version_dir, version_dir,
upload_path upload_path
]) ])
logger.info("Removing build directory %s" % build_dir)
shutil.rmtree(build_dir)