mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Implement the data uploady downloady stuff in setup.
This commit is contained in:
parent
91a62dbee5
commit
a95d88d1b9
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,3 +6,5 @@ dist
|
||||
pip-log.txt
|
||||
.coverage
|
||||
*~
|
||||
wordfreq_data/
|
||||
wordfreq-data.tar.gz
|
||||
|
75
setup.py
75
setup.py
@ -1,14 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
from setuptools import setup
|
||||
from distutils.core import Command
|
||||
from setuptools.command.install import install
|
||||
from setuptools.command.develop import develop
|
||||
|
||||
# Make sure we can import stuff from here.
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Make sure we can import stuff from here.
|
||||
current_dir = os.path.dirname(__file__)
|
||||
sys.path.insert(0, current_dir)
|
||||
|
||||
from wordfreq.config import VERSION
|
||||
from wordfreq import config, transfer
|
||||
|
||||
classifiers=[
|
||||
'Intended Audience :: Developers',
|
||||
@ -31,9 +36,65 @@ classifiers=[
|
||||
README_contents = open(os.path.join(current_dir, 'README.txt')).read()
|
||||
doclines = README_contents.split("\n")
|
||||
|
||||
|
||||
class SimpleCommand(Command):
    """
    Base class for setup.py commands that take no options.

    It supplies the empty option list and the two no-op option hooks, so
    that a subclass only has to define `description` and `run`.
    """
    # No command-line options are accepted.
    user_options = []

    def initialize_options(self):
        """Do nothing: there are no options to give default values to."""

    def finalize_options(self):
        """Do nothing: there are no options to validate."""
|
||||
|
||||
class BuildDatabaseCommand(SimpleCommand):
    """
    A setup.py command that builds the word frequency database.
    """
    description = "Build the word frequency database from raw data"

    def run(self):
        """Build the database by calling `wordfreq.build.load_all_data`."""
        # The import is deferred until the command actually runs --
        # presumably so the other commands work even when wordfreq.build
        # cannot be imported (TODO confirm).
        from wordfreq.build import load_all_data as build_database
        build_database()
|
||||
|
||||
|
||||
class DownloadDatabaseCommand(SimpleCommand):
    """
    A setup.py command that fetches the pre-built database, by delegating
    to `transfer.download_db` with its default arguments.
    """
    user_options = []
    description = "Download the built word frequency database"

    def run(self):
        """Download the database file."""
        transfer.download_db()
|
||||
|
||||
|
||||
class DownloadRawDataCommand(SimpleCommand):
    """
    A setup.py command that fetches and unpacks the raw wordlist data, by
    delegating to `transfer.download_and_extract_raw_data` with its default
    arguments.
    """
    user_options = []
    description = "Download the raw wordlist data"

    def run(self):
        """Download the raw data archive and extract it."""
        transfer.download_and_extract_raw_data()
|
||||
|
||||
|
||||
class UploadDataCommand(SimpleCommand):
    """
    A setup.py command that uploads the raw data and the database, by
    delegating to `transfer.upload_data` with its default arguments.
    """
    user_options = []
    description = "Upload the raw data and database"

    def run(self):
        """Upload the data files."""
        transfer.upload_data()
|
||||
|
||||
|
||||
class CustomInstallCommand(install):
    """
    The usual `install` command, followed by a download of the database.
    """

    def run(self):
        """Install the package, then run the `download_db` command."""
        # Call the base class explicitly rather than through super().
        install.run(self)
        self.run_command('download_db')
|
||||
|
||||
|
||||
class CustomDevelopCommand(develop):
    """
    The usual `develop` command, followed by a download of the database.
    """

    def run(self):
        """Set up the development install, then run `download_db`."""
        # Call the base class explicitly rather than through super().
        develop.run(self)
        self.run_command('download_db')
|
||||
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version=VERSION,
|
||||
version=config.VERSION,
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='dev@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
@ -44,4 +105,12 @@ setup(
|
||||
long_description = "\n".join(doclines[2:]),
|
||||
packages=['wordfreq'],
|
||||
install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
|
||||
cmdclass = {
|
||||
'build_db': BuildDatabaseCommand,
|
||||
'download_db': DownloadDatabaseCommand,
|
||||
'download_raw': DownloadRawDataCommand,
|
||||
'upload_data': UploadDataCommand,
|
||||
'install': CustomInstallCommand,
|
||||
'develop': CustomDevelopCommand
|
||||
}
|
||||
)
|
||||
|
@ -4,13 +4,6 @@ import os
|
||||
DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
||||
or os.path.expanduser('~/.cache/wordfreq'))
|
||||
|
||||
# Where should raw data go? Inside the package isn't necessarily a good
|
||||
# place for it, because it might be installed in the system site-packages.
|
||||
#
|
||||
# The current directory -- as you're running the setup.py script -- seems
|
||||
# as reasonable as anything.
|
||||
RAW_DATA_DIR = './wordfreq_data'
|
||||
|
||||
# When the minor version number increments, the data may change.
|
||||
VERSION = '0.1.1'
|
||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||
@ -20,3 +13,21 @@ DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION)
|
||||
|
||||
# How many words do we cache the frequencies for?
|
||||
CACHE_SIZE = 100000
|
||||
|
||||
# Where can the data be downloaded from? The WORDFREQ_URL environment
# variable overrides the default location.
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
                or 'http://ferret.lumi/dist/wordfreq/')

# URLs always use forward slashes, so the pieces are joined with '/' here
# instead of os.path.join, which would insert backslashes on Windows.
# Trailing slashes are stripped from the base so that the result is the same
# whether or not DOWNLOAD_URL ends with one.
_DOWNLOAD_BASE = DOWNLOAD_URL.rstrip('/')
RAW_DATA_URL = '/'.join([_DOWNLOAD_BASE, MINOR_VERSION,
                         'wordfreq-data.tar.gz'])
DB_URL = '/'.join([_DOWNLOAD_BASE, MINOR_VERSION,
                   'wordfreq-%s.db' % MINOR_VERSION])
|
||||
|
||||
# How do we actually get it there? This is the path, including hostname, to give
|
||||
# to scp to upload the file.
|
||||
UPLOAD_PATH = 'ferret.lumi:/var/lib/lumi/dist/wordfreq/'
|
||||
|
||||
# Where should raw data go? Inside the package isn't necessarily a good
|
||||
# place for it, because it might be installed in the system site-packages.
|
||||
#
|
||||
# The current directory -- as you're running the setup.py script -- seems
|
||||
# as reasonable as anything.
|
||||
RAW_DATA_DIR = './wordfreq_data'
|
||||
|
@ -1,11 +1,29 @@
|
||||
import urllib, os, sys
|
||||
import tarfile
|
||||
"""
|
||||
This module provides some functions that help us work with the
|
||||
wordlist data files, so that they don't have to be stored in Git
|
||||
or in the source package.
|
||||
|
||||
These functions won't be used in the course of using the wordfreq
|
||||
package normally; instead, they're called from commands in setup.py.
|
||||
"""
|
||||
|
||||
from wordfreq import config
|
||||
import urllib
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
import tarfile
|
||||
import logging
|
||||
import subprocess
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
"""
|
||||
This class watches the progress of a urllib download task, and updates
|
||||
sys.stdout when the percentage changes.
|
||||
"""
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.progress = None
|
||||
@ -21,6 +39,16 @@ class ProgressTracker(object):
|
||||
sys.stdout.write('\n')
|
||||
|
||||
|
||||
def ensure_dir_exists(dest_filename):
    """
    Something we'll need to do often: given a filename we want to write to,
    make sure its containing directory exists.

    `dest_filename` is the path of the file that is about to be written.
    Any missing parent directories are created. A filename with no
    directory part refers to the current directory, so there is nothing
    to create in that case.

    Raises OSError if the directory cannot be created for any reason other
    than it already existing.
    """
    import errno

    base_dir = os.path.dirname(dest_filename)
    if not base_dir:
        # os.makedirs('') would raise an error; the current directory
        # already exists.
        return

    # Try to create the directory and tolerate "already exists", instead of
    # checking first: the directory could appear between a check and the
    # creation.
    try:
        os.makedirs(base_dir)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
|
||||
|
||||
|
||||
def download(url, dest_filename):
|
||||
"""
|
||||
Download the file at `url` to `dest_filename`. Show a progress bar
|
||||
@ -38,25 +66,73 @@ def download(url, dest_filename):
|
||||
|
||||
|
||||
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the .tar.gz of raw data that can be used to build the database,
    and extract it.

    `url` is where to download the archive from; it defaults to
    `config.RAW_DATA_URL`.

    `root_dir` is the directory that the archive is saved in and extracted
    into; it defaults to the directory containing `config.RAW_DATA_DIR`.
    """
    if url is None:
        url = config.RAW_DATA_URL

    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    # Download the archive exactly once, after making sure there is a
    # directory to put it in.
    dest_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)

    logger.info("Extracting %s" % dest_filename)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, and the archive comes from a URL that can be overridden.
    # Only use this with a download source you trust.
    with tarfile.open(dest_filename, 'r') as tarf:
        tarf.extractall(root_dir)
|
||||
|
||||
|
||||
def download_db(url=None, dest_filename=None):
    """
    Download the database itself, so we don't have to build it.

    `url` is where to download the database from; it defaults to
    `config.DB_URL`.

    `dest_filename` is the local path to save the database to; it defaults
    to `config.DB_FILENAME`. Its containing directory is created if needed.
    """
    if url is None:
        url = config.DB_URL

    if dest_filename is None:
        dest_filename = config.DB_FILENAME

    # Create the destination directory first, then download exactly once.
    ensure_dir_exists(dest_filename)
    download(url, dest_filename)
|
||||
|
||||
|
||||
def upload_data(upload_path=None):
    """
    Collect the raw data and the database file, and upload them to an
    appropriate directory on the server that hosts downloads.

    This requires that it's running in a reasonable Unix environment
    (with rsync at /usr/bin/rsync), and more notably, that it has the proper
    SSH keys to upload to that server.

    `upload_path` is the rsync destination, including the hostname; it
    defaults to `config.UPLOAD_PATH`.

    Raises subprocess.CalledProcessError if rsync exits with a failure
    status, and IOError/OSError if the files cannot be collected. The
    temporary build directory is removed in every case.
    """
    if upload_path is None:
        upload_path = config.UPLOAD_PATH

    build_dir = tempfile.mkdtemp('.wordfreq')
    try:
        # Everything to upload is staged in a directory named after the
        # minor version, and that directory is what rsync sends.
        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
        os.makedirs(version_dir)

        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
        logger.info("Creating %s", source_filename)
        with tarfile.open(source_filename, 'w:gz') as tarf:
            tarf.add(config.RAW_DATA_DIR)

        logger.info("Copying database file %s", config.DB_FILENAME)
        # Copy in-process, so that a missing database file raises an error
        # here instead of being silently left out of the upload.
        shutil.copy(config.DB_FILENAME, version_dir)

        logger.info("Uploading to %s", upload_path)
        # check_call turns a failed upload into an exception rather than an
        # ignored exit status.
        subprocess.check_call([
            '/usr/bin/rsync',
            '-avz',
            version_dir,
            upload_path
        ])
    finally:
        # Clean up the staging directory even when a step above fails.
        logger.info("Removing build directory %s", build_dir)
        shutil.rmtree(build_dir)
|
||||
|
Loading…
Reference in New Issue
Block a user