diff --git a/.gitignore b/.gitignore index 53d257f..b3f65b0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ dist pip-log.txt .coverage *~ +wordfreq_data/ +wordfreq-data.tar.gz diff --git a/setup.py b/setup.py index b03c4e7..43bc636 100755 --- a/setup.py +++ b/setup.py @@ -1,14 +1,19 @@ #!/usr/bin/env python from setuptools import setup from distutils.core import Command +from setuptools.command.install import install +from setuptools.command.develop import develop -# Make sure we can import stuff from here. import os import sys +import logging +logging.basicConfig(level=logging.INFO) + +# Make sure we can import stuff from here. current_dir = os.path.dirname(__file__) sys.path.insert(0, current_dir) -from wordfreq.config import VERSION +from wordfreq import config, transfer classifiers=[ 'Intended Audience :: Developers', @@ -31,9 +36,65 @@ classifiers=[ README_contents = open(os.path.join(current_dir, 'README.txt')).read() doclines = README_contents.split("\n") + +class SimpleCommand(Command): + """ + Get the boilerplate out of the way for commands that take no options. 
+ """ + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + +class BuildDatabaseCommand(SimpleCommand): + description = "Build the word frequency database from raw data" + def run(self): + from wordfreq.build import load_all_data + load_all_data() + + +class DownloadDatabaseCommand(SimpleCommand): + description = "Download the built word frequency database" + user_options = [] + + def run(self): + transfer.download_db() + + +class DownloadRawDataCommand(SimpleCommand): + description = "Download the raw wordlist data" + user_options = [] + + def run(self): + transfer.download_and_extract_raw_data() + + +class UploadDataCommand(SimpleCommand): + description = "Upload the raw data and database" + user_options = [] + + def run(self): + transfer.upload_data() + + +class CustomInstallCommand(install): + def run(self): + install.run(self) + self.run_command('download_db') + + +class CustomDevelopCommand(develop): + def run(self): + develop.run(self) + self.run_command('download_db') + + setup( name="wordfreq", - version=VERSION, + version=config.VERSION, maintainer='Luminoso Technologies, Inc.', maintainer_email='dev@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', @@ -44,4 +105,12 @@ setup( long_description = "\n".join(doclines[2:]), packages=['wordfreq'], install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'], + cmdclass = { + 'build_db': BuildDatabaseCommand, + 'download_db': DownloadDatabaseCommand, + 'download_raw': DownloadRawDataCommand, + 'upload_data': UploadDataCommand, + 'install': CustomInstallCommand, + 'develop': CustomDevelopCommand + } ) diff --git a/wordfreq/config.py b/wordfreq/config.py index 4eea8ce..7f38e17 100644 --- a/wordfreq/config.py +++ b/wordfreq/config.py @@ -4,13 +4,6 @@ import os DB_DIR = (os.environ.get('WORDFREQ_DATA') or os.path.expanduser('~/.cache/wordfreq')) -# Where should raw data go? 
Inside the package isn't necessary a good -# place for it, because it might be installed in the system site-packages. -# -# The current directory -- as you're running the setup.py script -- seems -# as reasonable as anything. -RAW_DATA_DIR = './wordfreq_data' - # When the minor version number increments, the data may change. VERSION = '0.1.1' MINOR_VERSION = '.'.join(VERSION.split('.')[:2]) @@ -20,3 +13,21 @@ DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION) # How many words do we cache the frequencies for? CACHE_SIZE = 100000 + +# Where can the data be downloaded from? +DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL') + or 'http://ferret.lumi/dist/wordfreq/') +RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz') +DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, + 'wordfreq-%s.db' % MINOR_VERSION) + +# How do we actually get it there? This is the path, including hostname, to give +# to rsync to upload the file. +UPLOAD_PATH = 'ferret.lumi:/var/lib/lumi/dist/wordfreq/' + +# Where should raw data go? Inside the package isn't necessarily a good +# place for it, because it might be installed in the system site-packages. +# +# The current directory -- as you're running the setup.py script -- seems +# as reasonable as anything. +RAW_DATA_DIR = './wordfreq_data' diff --git a/wordfreq/transfer.py b/wordfreq/transfer.py index 1854dce..76fc2c6 100644 --- a/wordfreq/transfer.py +++ b/wordfreq/transfer.py @@ -1,11 +1,29 @@ -import urllib, os, sys -import tarfile +""" +This module provides some functions that help us work with the +wordlist data files, so that they don't have to be stored in Git +or in the source package. + +These functions won't be used in the course of using the wordfreq +package normally; instead, they're called from commands in setup.py. 
+""" + from wordfreq import config +import urllib +import os +import sys +import shutil +import tempfile +import tarfile import logging +import subprocess logger = logging.getLogger(__name__) class ProgressTracker(object): + """ + This class watches the progress of a urllib download task, and updates + sys.stdout when the percentage changes. + """ def __init__(self, url): self.url = url self.progress = None @@ -21,6 +39,16 @@ class ProgressTracker(object): sys.stdout.write('\n') +def ensure_dir_exists(dest_filename): + """ + Something we'll need to do often: given a filename we want to write to, + make sure its containing directory exists. + """ + base_dir = os.path.dirname(dest_filename) + if not os.path.exists(base_dir): + os.makedirs(base_dir) + + def download(url, dest_filename): """ Download the file at `url` to `dest_filename`. Show a progress bar @@ -38,25 +66,73 @@ def download(url, dest_filename): def download_and_extract_raw_data(url=None, root_dir=None): + """ + Download the .tar.gz of raw data that can be used to build the database. + """ if url is None: url = config.RAW_DATA_URL if root_dir is None: root_dir = os.path.dirname(config.RAW_DATA_DIR) - local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz') - download(url, local_filename) + dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz') + ensure_dir_exists(dest_filename) + download(url, dest_filename) - logger.info("Extracting %s" % local_filename) - with tarfile.open(local_filename, 'r') as tarf: - tarf.extract_all(root_dir) + logger.info("Extracting %s" % dest_filename) + with tarfile.open(dest_filename, 'r') as tarf: + tarf.extractall(root_dir) -def download_db(url=None, target=None): +def download_db(url=None, dest_filename=None): + """ + Download the database itself, so we don't have to build it. 
+ """ if url is None: url = config.DB_URL - if target is None: - target = config.DB_FILENAME + if dest_filename is None: + dest_filename = config.DB_FILENAME - download(url, target) + ensure_dir_exists(dest_filename) + download(url, dest_filename) + + +def upload_data(upload_path=None): + """ + Collect the raw data and the database file, and upload them to an + appropriate directory on the server that hosts downloads. + + This requires that it's running in a reasonable Unix environment, + and more notably, that it has the proper SSH keys to upload to that + server. + """ + if upload_path is None: + upload_path = config.UPLOAD_PATH + + build_dir = tempfile.mkdtemp('.wordfreq') + version_dir = os.path.join(build_dir, config.MINOR_VERSION) + os.makedirs(version_dir) + + source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz') + logger.info("Creating %s" % source_filename) + with tarfile.open(source_filename, 'w:gz') as tarf: + tarf.add(config.RAW_DATA_DIR) + + logger.info("Copying database file %s" % config.DB_FILENAME) + subprocess.call([ + '/bin/cp', + config.DB_FILENAME, + version_dir + ]) + + logger.info("Uploading to %s" % upload_path) + subprocess.call([ + '/usr/bin/rsync', + '-avz', + version_dir, + upload_path + ]) + + logger.info("Removing build directory %s" % build_dir) + shutil.rmtree(build_dir)