mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
cleanups to building and uploading, from code review
This commit is contained in:
parent
cb7b2b76e6
commit
5dee417302
@ -37,13 +37,9 @@ def read_multilingual_csv(filename):
|
|||||||
raw_freqs = _read_csv_basic(filename)
|
raw_freqs = _read_csv_basic(filename)
|
||||||
for wordlang in raw_freqs:
|
for wordlang in raw_freqs:
|
||||||
word, lang = wordlang.rsplit('|', 1)
|
word, lang = wordlang.rsplit('|', 1)
|
||||||
word = standardize_word(word)
|
|
||||||
unscaled[lang][word] = raw_freqs[wordlang]
|
unscaled[lang][word] = raw_freqs[wordlang]
|
||||||
|
|
||||||
scaled = {}
|
return {key: _scale_freqs(unscaled[key]) for key in unscaled}
|
||||||
for key in unscaled:
|
|
||||||
scaled[key] = _scale_freqs(unscaled[key])
|
|
||||||
return scaled
|
|
||||||
|
|
||||||
|
|
||||||
def _read_csv_basic(filename):
|
def _read_csv_basic(filename):
|
||||||
@ -87,12 +83,8 @@ def _scale_freqs(counts):
|
|||||||
Take in unscaled word counts or frequencies, and scale them so that
|
Take in unscaled word counts or frequencies, and scale them so that
|
||||||
they add up to 1.0.
|
they add up to 1.0.
|
||||||
"""
|
"""
|
||||||
freqs = {}
|
|
||||||
total = sum(counts.values())
|
total = sum(counts.values())
|
||||||
for word in counts:
|
return {word: counts[word] / total for word in counts}
|
||||||
freqs[word] = counts[word] / total
|
|
||||||
|
|
||||||
return freqs
|
|
||||||
|
|
||||||
|
|
||||||
def save_wordlist_to_db(conn, listname, lang, freqs):
|
def save_wordlist_to_db(conn, listname, lang, freqs):
|
||||||
@ -122,15 +114,17 @@ def create_db(filename):
|
|||||||
This should be safe to run (and have no effect) if the database already
|
This should be safe to run (and have no effect) if the database already
|
||||||
exists.
|
exists.
|
||||||
"""
|
"""
|
||||||
conn = get_db_connection(filename)
|
|
||||||
base_dir = os.path.dirname(filename)
|
base_dir = os.path.dirname(filename)
|
||||||
if not os.path.exists(base_dir):
|
if not os.path.exists(base_dir):
|
||||||
os.makedirs(base_dir)
|
os.makedirs(base_dir)
|
||||||
|
|
||||||
|
conn = get_db_connection(filename)
|
||||||
|
|
||||||
conn.execute(schema.SCHEMA)
|
conn.execute(schema.SCHEMA)
|
||||||
for index_definition in schema.INDICES:
|
for index_definition in schema.INDICES:
|
||||||
conn.execute(index_definition)
|
conn.execute(index_definition)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
return conn
|
||||||
|
|
||||||
|
|
||||||
def get_db_connection(filename):
|
def get_db_connection(filename):
|
||||||
@ -164,9 +158,8 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
|||||||
filename = config.DB_FILENAME
|
filename = config.DB_FILENAME
|
||||||
|
|
||||||
logger.info("Creating database")
|
logger.info("Creating database")
|
||||||
create_db(filename)
|
conn = create_db(filename)
|
||||||
|
|
||||||
conn = get_db_connection(filename)
|
|
||||||
logger.info("Loading Leeds internet corpus:")
|
logger.info("Loading Leeds internet corpus:")
|
||||||
for lang in LEEDS_LANGUAGES:
|
for lang in LEEDS_LANGUAGES:
|
||||||
logger.info("\tLanguage: %s" % lang)
|
logger.info("\tLanguage: %s" % lang)
|
||||||
|
@ -10,16 +10,16 @@ package normally; instead, they're called from commands in setup.py.
|
|||||||
from wordfreq import config
|
from wordfreq import config
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
|
||||||
import tempfile
|
|
||||||
import tarfile
|
import tarfile
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
if sys.version_info.major == 2:
|
if sys.version_info.major == 2:
|
||||||
|
PY2 = True
|
||||||
from urllib import urlretrieve
|
from urllib import urlretrieve
|
||||||
else:
|
else:
|
||||||
|
PY2 = False
|
||||||
from urllib.request import urlretrieve
|
from urllib.request import urlretrieve
|
||||||
|
|
||||||
|
|
||||||
@ -103,36 +103,35 @@ def upload_data(upload_path=None):
|
|||||||
Collect the raw data and the database file, and upload them to an
|
Collect the raw data and the database file, and upload them to an
|
||||||
appropriate directory on the server that hosts downloads.
|
appropriate directory on the server that hosts downloads.
|
||||||
|
|
||||||
This requires that it's running in a reasonable Unix environment,
|
This requires that it's running in a reasonable Unix environment, on Python
|
||||||
and more notably, that it has the proper SSH keys to upload to that
|
3, and more notably, that it has the proper SSH keys to upload to that
|
||||||
server.
|
server.
|
||||||
"""
|
"""
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
if upload_path is None:
|
if upload_path is None:
|
||||||
upload_path = config.UPLOAD_PATH
|
upload_path = config.UPLOAD_PATH
|
||||||
|
|
||||||
build_dir = tempfile.mkdtemp('.wordfreq')
|
with TemporaryDirectory('.wordfreq') as build_dir:
|
||||||
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
|
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
|
||||||
os.makedirs(version_dir)
|
os.makedirs(version_dir)
|
||||||
|
|
||||||
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
|
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
|
||||||
logger.info("Creating %s" % source_filename)
|
logger.info("Creating %s" % source_filename)
|
||||||
with tarfile.open(source_filename, 'w:gz') as tarf:
|
with tarfile.open(source_filename, 'w:gz') as tarf:
|
||||||
tarf.add(config.RAW_DATA_DIR)
|
tarf.add(config.RAW_DATA_DIR)
|
||||||
|
|
||||||
logger.info("Copying database file %s" % config.DB_FILENAME)
|
logger.info("Copying database file %s" % config.DB_FILENAME)
|
||||||
subprocess.call([
|
subprocess.call([
|
||||||
'/bin/cp',
|
'/bin/cp',
|
||||||
config.DB_FILENAME,
|
config.DB_FILENAME,
|
||||||
version_dir
|
version_dir
|
||||||
])
|
])
|
||||||
|
|
||||||
logger.info("Uploading to %s" % upload_path)
|
logger.info("Uploading to %s" % upload_path)
|
||||||
subprocess.call([
|
subprocess.call([
|
||||||
'/usr/bin/rsync',
|
'/usr/bin/rsync',
|
||||||
'-avz',
|
'-avz',
|
||||||
version_dir,
|
version_dir,
|
||||||
upload_path
|
upload_path
|
||||||
])
|
])
|
||||||
|
|
||||||
logger.info("Removing build directory %s" % build_dir)
|
|
||||||
shutil.rmtree(build_dir)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user