Implement the data uploady downloady stuff in setup.

This commit is contained in:
Robyn Speer 2013-10-29 16:44:13 -04:00
parent 91a62dbee5
commit a95d88d1b9
4 changed files with 179 additions and 21 deletions

2
.gitignore vendored
View File

@ -6,3 +6,5 @@ dist
pip-log.txt
.coverage
*~
wordfreq_data/
wordfreq-data.tar.gz

View File

@ -1,14 +1,19 @@
#!/usr/bin/env python
from setuptools import setup
from distutils.core import Command
from setuptools.command.install import install
from setuptools.command.develop import develop
# Make sure we can import stuff from here.
import os
import sys
import logging
logging.basicConfig(level=logging.INFO)
# Make sure we can import stuff from here.
current_dir = os.path.dirname(__file__)
sys.path.insert(0, current_dir)
from wordfreq.config import VERSION
from wordfreq import config, transfer
classifiers=[
'Intended Audience :: Developers',
@ -31,9 +36,65 @@ classifiers=[
README_contents = open(os.path.join(current_dir, 'README.txt')).read()
doclines = README_contents.split("\n")
class SimpleCommand(Command):
    """
    Base class for setup.py commands that take no options.

    It supplies the empty `user_options` list and the no-op
    `initialize_options` / `finalize_options` hooks, so that subclasses
    only have to provide `run`.
    """
    # These commands accept no command-line options.
    user_options = []

    def initialize_options(self):
        """Do nothing: there are no options to set up."""

    def finalize_options(self):
        """Do nothing: there are no options to validate."""
class BuildDatabaseCommand(SimpleCommand):
    """
    The `build_db` command: build the database by calling
    `wordfreq.build.load_all_data`.
    """
    description = "Build the word frequency database from raw data"

    def run(self):
        """Import the build code and load all the data into the database."""
        # Imported here rather than at the top of the file, so that
        # wordfreq.build is only loaded when this command actually runs.
        from wordfreq.build import load_all_data
        load_all_data()
class DownloadDatabaseCommand(SimpleCommand):
    """
    The `download_db` command: fetch the pre-built database by calling
    `transfer.download_db` with its default arguments.
    """
    # `user_options` is inherited from SimpleCommand; no need to repeat it.
    description = "Download the built word frequency database"

    def run(self):
        """Download the database using the defaults of transfer.download_db."""
        transfer.download_db()
class DownloadRawDataCommand(SimpleCommand):
    """
    The `download_raw` command: fetch and unpack the raw wordlist data by
    calling `transfer.download_and_extract_raw_data` with its default
    arguments.
    """
    # `user_options` is inherited from SimpleCommand; no need to repeat it.
    description = "Download the raw wordlist data"

    def run(self):
        """Download and extract the raw data using the transfer defaults."""
        transfer.download_and_extract_raw_data()
class UploadDataCommand(SimpleCommand):
    """
    The `upload_data` command: package and upload the raw data and the
    database by calling `transfer.upload_data` with its default arguments.
    """
    # `user_options` is inherited from SimpleCommand; no need to repeat it.
    description = "Upload the raw data and database"

    def run(self):
        """Upload the data using the defaults of transfer.upload_data."""
        transfer.upload_data()
class CustomInstallCommand(install):
    """
    Replacement for the `install` command that also runs `download_db`
    once the normal installation has finished.
    """

    def run(self):
        """Run the standard install, then the `download_db` command."""
        # Call the base class explicitly rather than through super().
        install.run(self)
        self.run_command('download_db')
class CustomDevelopCommand(develop):
    """
    Replacement for the `develop` command that also runs `download_db`
    once the normal development install has finished.
    """

    def run(self):
        """Run the standard develop step, then the `download_db` command."""
        # Call the base class explicitly rather than through super().
        develop.run(self)
        self.run_command('download_db')
setup(
name="wordfreq",
version=VERSION,
version=config.VERSION,
maintainer='Luminoso Technologies, Inc.',
maintainer_email='dev@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@ -44,4 +105,12 @@ setup(
long_description = "\n".join(doclines[2:]),
packages=['wordfreq'],
install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
cmdclass = {
'build_db': BuildDatabaseCommand,
'download_db': DownloadDatabaseCommand,
'download_raw': DownloadRawDataCommand,
'upload_data': UploadDataCommand,
'install': CustomInstallCommand,
'develop': CustomDevelopCommand
}
)

View File

@ -4,13 +4,6 @@ import os
DB_DIR = (os.environ.get('WORDFREQ_DATA')
or os.path.expanduser('~/.cache/wordfreq'))
# Where should raw data go? Inside the package isn't necessarily a good
# place for it, because it might be installed in the system site-packages.
#
# The current directory -- as you're running the setup.py script -- seems
# as reasonable as anything.
RAW_DATA_DIR = './wordfreq_data'
# When the minor version number increments, the data may change.
VERSION = '0.1.1'
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
@ -20,3 +13,21 @@ DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION)
# How many words do we cache the frequencies for?
CACHE_SIZE = 100000
# Where can the data be downloaded from? The WORDFREQ_URL environment
# variable overrides the default location.
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
                or 'http://ferret.lumi/dist/wordfreq/')

# URLs always use forward slashes, so join the pieces with '/' instead of
# os.path.join, which would insert backslashes on Windows. Stripping the
# trailing slash first avoids doubling it.
RAW_DATA_URL = '/'.join([
    DOWNLOAD_URL.rstrip('/'), MINOR_VERSION, 'wordfreq-data.tar.gz'
])
DB_URL = '/'.join([
    DOWNLOAD_URL.rstrip('/'), MINOR_VERSION, 'wordfreq-%s.db' % MINOR_VERSION
])
# How do we actually get it there? This is the path, including hostname, to give
# to scp to upload the file.
UPLOAD_PATH = 'ferret.lumi:/var/lib/lumi/dist/wordfreq/'
# Where should raw data go? Inside the package isn't necessarily a good
# place for it, because it might be installed in the system site-packages.
#
# The current directory -- as you're running the setup.py script -- seems
# as reasonable as anything.
RAW_DATA_DIR = './wordfreq_data'

View File

@ -1,11 +1,29 @@
import urllib, os, sys
import tarfile
"""
This module provides some functions that help us work with the
wordlist data files, so that they don't have to be stored in Git
or in the source package.
These functions won't be used in the course of using the wordfreq
package normally; instead, they're called from commands in setup.py.
"""
from wordfreq import config
import urllib
import os
import sys
import shutil
import tempfile
import tarfile
import logging
import subprocess
logger = logging.getLogger(__name__)
class ProgressTracker(object):
"""
This class watches the progress of a urllib download task, and updates
sys.stdout when the percentage changes.
"""
def __init__(self, url):
self.url = url
self.progress = None
@ -21,6 +39,16 @@ class ProgressTracker(object):
sys.stdout.write('\n')
def ensure_dir_exists(dest_filename):
    """
    Something we'll need to do often: given a filename we want to write to,
    make sure its containing directory exists.

    If `dest_filename` has no directory component (it refers to a file in
    the current directory), there is nothing to create and this does nothing.

    Raises OSError if the directory is missing and cannot be created.
    """
    base_dir = os.path.dirname(dest_filename)
    if not base_dir:
        # A bare filename: os.makedirs('') would raise, and the current
        # directory already exists.
        return
    try:
        os.makedirs(base_dir)
    except OSError:
        # The directory may already exist, possibly created by another
        # process in the meantime. Only propagate the error if it is
        # really not there.
        if not os.path.isdir(base_dir):
            raise
def download(url, dest_filename):
"""
Download the file at `url` to `dest_filename`. Show a progress bar
@ -38,25 +66,73 @@ def download(url, dest_filename):
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the .tar.gz of raw data that can be used to build the database.

    `url` is the location of the archive; it defaults to
    `config.RAW_DATA_URL`. `root_dir` is the directory in which the archive
    is saved (as 'wordfreq-data.tar.gz') and extracted; it defaults to the
    directory containing `config.RAW_DATA_DIR`.
    """
    if url is None:
        url = config.RAW_DATA_URL
    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    # Download the archive exactly once, after making sure there is a
    # directory to put it in.
    dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)

    logger.info("Extracting %s", dest_filename)
    with tarfile.open(dest_filename, 'r') as tarf:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. Only point `url` at a server you trust.
        tarf.extractall(root_dir)
def download_db(url=None, dest_filename=None):
    """
    Download the database itself, so we don't have to build it.

    `url` is the location of the database file; it defaults to
    `config.DB_URL`. `dest_filename` is where the file is saved; it defaults
    to `config.DB_FILENAME`. The directory containing `dest_filename` is
    created if necessary.
    """
    if url is None:
        url = config.DB_URL
    if dest_filename is None:
        dest_filename = config.DB_FILENAME

    ensure_dir_exists(dest_filename)
    download(url, dest_filename)
def upload_data(upload_path=None):
    """
    Collect the raw data and the database file, and upload them to an
    appropriate directory on the server that hosts downloads.

    `upload_path` is the destination handed to rsync; it defaults to
    `config.UPLOAD_PATH`.

    This requires that it's running in a reasonable Unix environment
    (rsync must be available at /usr/bin/rsync), and more notably, that it
    has the proper SSH keys to upload to that server.

    Raises subprocess.CalledProcessError if rsync exits with an error, and
    IOError or OSError if the data cannot be packaged or copied. The
    temporary build directory is removed in either case.
    """
    if upload_path is None:
        upload_path = config.UPLOAD_PATH

    build_dir = tempfile.mkdtemp('.wordfreq')
    try:
        # Stage everything in a directory named after the minor version, so
        # that rsync creates or updates that directory on the server.
        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
        os.makedirs(version_dir)

        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
        logger.info("Creating %s", source_filename)
        with tarfile.open(source_filename, 'w:gz') as tarf:
            tarf.add(config.RAW_DATA_DIR)

        logger.info("Copying database file %s", config.DB_FILENAME)
        # shutil.copy raises if the copy fails, instead of returning an
        # exit code that is easy to ignore.
        shutil.copy(config.DB_FILENAME, version_dir)

        logger.info("Uploading to %s", upload_path)
        # check_call raises CalledProcessError on a non-zero exit status, so
        # a failed upload is not silently reported as a success.
        subprocess.check_call([
            '/usr/bin/rsync',
            '-avz',
            version_dir,
            upload_path
        ])
    finally:
        # Clean up the staging directory even when a step above failed.
        logger.info("Removing build directory %s", build_dir)
        shutil.rmtree(build_dir)