Implement the data uploady downloady stuff in setup.

2024-12-24 09:51:38 +00:00 · 2013-10-29 16:44:13 -04:00 · 2013-10-29 16:44:13 -04:00 · ca5b3e2f5d
commit ca5b3e2f5d
parent 793893e738
4 changed files with 179 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,3 +6,5 @@ dist
 pip-log.txt
 .coverage
 *~
+wordfreq_data/
+wordfreq-data.tar.gz
--- a/setup.py
+++ b/setup.py
@ -1,14 +1,19 @@
 #!/usr/bin/env python
 from setuptools import setup
 from distutils.core import Command
+from setuptools.command.install import install
+from setuptools.command.develop import develop

-# Make sure we can import stuff from here.
 import os
 import sys
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# Make sure we can import stuff from here.
 current_dir = os.path.dirname(__file__)
 sys.path.insert(0, current_dir)

-from wordfreq.config import VERSION
+from wordfreq import config, transfer

 classifiers=[
    'Intended Audience :: Developers',
@ -31,9 +36,65 @@ classifiers=[
 README_contents = open(os.path.join(current_dir, 'README.txt')).read()
 doclines = README_contents.split("\n")

+
+class SimpleCommand(Command):
+    """
+    Get the boilerplate out of the way for commands that take no options.
+    """
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+class BuildDatabaseCommand(SimpleCommand):
+    description = "Build the word frequency database from raw data"
+    def run(self):
+        from wordfreq.build import load_all_data
+        load_all_data()
+
+
+class DownloadDatabaseCommand(SimpleCommand):
+    description = "Download the built word frequency database"
+    user_options = []
+
+    def run(self):
+        transfer.download_db()
+
+
+class DownloadRawDataCommand(SimpleCommand):
+    description = "Download the raw wordlist data"
+    user_options = []
+
+    def run(self):
+        transfer.download_and_extract_raw_data()
+
+
+class UploadDataCommand(SimpleCommand):
+    description = "Upload the raw data and database"
+    user_options = []
+
+    def run(self):
+        transfer.upload_data()
+
+
+class CustomInstallCommand(install):
+    def run(self):
+        install.run(self)
+        self.run_command('download_db')
+
+
+class CustomDevelopCommand(develop):
+    def run(self):
+        develop.run(self)
+        self.run_command('download_db')
+
+
 setup(
    name="wordfreq",
-    version=VERSION,
+    version=config.VERSION,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
@ -44,4 +105,12 @@ setup(
    long_description = "\n".join(doclines[2:]),
    packages=['wordfreq'],
    install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
+    cmdclass = {
+        'build_db': BuildDatabaseCommand,
+        'download_db': DownloadDatabaseCommand,
+        'download_raw': DownloadRawDataCommand,
+        'upload_data': UploadDataCommand,
+        'install': CustomInstallCommand,
+        'develop': CustomDevelopCommand
+    }
 )
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@ -4,13 +4,6 @@ import os
 DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))

-# Where should raw data go? Inside the package isn't necessary a good
-# place for it, because it might be installed in the system site-packages.
-#
-# The current directory -- as you're running the setup.py script -- seems
-# as reasonable as anything.
-RAW_DATA_DIR = './wordfreq_data'
-
 # When the minor version number increments, the data may change.
 VERSION = '0.1.1'
 MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
@ -20,3 +13,21 @@ DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION)

 # How many words do we cache the frequencies for?
 CACHE_SIZE = 100000
+
+# Where can the data be downloaded from?
+DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
+                or 'http://ferret.lumi/dist/wordfreq/')
+RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
+DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
+                      'wordfreq-%s.db' % MINOR_VERSION)
+
+# How do we actually get it there? This is the destination, including the
+# hostname, that we give to rsync to upload the files.
+UPLOAD_PATH = 'ferret.lumi:/var/lib/lumi/dist/wordfreq/'
+
+# Where should raw data go? Inside the package isn't necessarily a good
+# place for it, because it might be installed in the system site-packages.
+#
+# The current directory -- as you're running the setup.py script -- seems
+# as reasonable as anything.
+RAW_DATA_DIR = './wordfreq_data'
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@ -1,11 +1,29 @@
-import urllib, os, sys
-import tarfile
+"""
+This module provides some functions that help us work with the
+wordlist data files, so that they don't have to be stored in Git
+or in the source package.
+
+These functions won't be used in the course of using the wordfreq
+package normally; instead, they're called from commands in setup.py.
+"""
+
 from wordfreq import config
+import urllib
+import os
+import sys
+import shutil
+import tempfile
+import tarfile
 import logging
+import subprocess
 logger = logging.getLogger(__name__)


 class ProgressTracker(object):
+    """
+    This class watches the progress of a urllib download task, and updates
+    sys.stdout when the percentage changes.
+    """
    def __init__(self, url):
        self.url = url
        self.progress = None
@ -21,6 +39,16 @@ class ProgressTracker(object):
        sys.stdout.write('\n')


+def ensure_dir_exists(dest_filename):
+    """
+    Something we'll need to do often: given a filename we want to write to,
+    make sure its containing directory exists.
+    """
+    base_dir = os.path.dirname(dest_filename)
+    if not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+
+
 def download(url, dest_filename):
    """
    Download the file at `url` to `dest_filename`. Show a progress bar
@ -38,25 +66,73 @@ def download(url, dest_filename):


 def download_and_extract_raw_data(url=None, root_dir=None):
+    """
+    Download and extract the .tar.gz of raw data used to build the database.
+    """
    if url is None:
        url = config.RAW_DATA_URL

    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

-    local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
-    download(url, local_filename)
+    dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
+    ensure_dir_exists(dest_filename)
+    download(url, dest_filename)

-    logger.info("Extracting %s" % local_filename)
-    with tarfile.open(local_filename, 'r') as tarf:
-        tarf.extract_all(root_dir)
+    logger.info("Extracting %s" % dest_filename)
+    with tarfile.open(dest_filename, 'r') as tarf:
+        tarf.extractall(root_dir)


-def download_db(url=None, target=None):
+def download_db(url=None, dest_filename=None):
+    """
+    Download the database itself, so we don't have to build it.
+    """
    if url is None:
        url = config.DB_URL

-    if target is None:
-        target = config.DB_FILENAME
+    if dest_filename is None:
+        dest_filename = config.DB_FILENAME

-    download(url, target)
+    ensure_dir_exists(dest_filename)
+    download(url, dest_filename)
+
+
+def upload_data(upload_path=None):
+    """
+    Collect the raw data and the database file, and upload them to an
+    appropriate directory on the server that hosts downloads.
+
+    This requires that it's running in a reasonable Unix environment,
+    and more notably, that it has the proper SSH keys to upload to that
+    server.
+    """
+    if upload_path is None:
+        upload_path = config.UPLOAD_PATH
+    
+    build_dir = tempfile.mkdtemp('.wordfreq')
+    version_dir = os.path.join(build_dir, config.MINOR_VERSION)
+    os.makedirs(version_dir)
+
+    source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
+    logger.info("Creating %s" % source_filename)
+    with tarfile.open(source_filename, 'w:gz') as tarf:
+        tarf.add(config.RAW_DATA_DIR)
+
+    logger.info("Copying database file %s" % config.DB_FILENAME)
+    subprocess.call([
+        '/bin/cp',
+        config.DB_FILENAME,
+        version_dir
+    ])
+
+    logger.info("Uploading to %s" % upload_path)
+    subprocess.call([
+        '/usr/bin/rsync',
+        '-avz',
+        version_dir,
+        upload_path
+    ])
+
+    logger.info("Removing build directory %s" % build_dir)
+    shutil.rmtree(build_dir)