mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Revert "code review and pep8 fixes"
This reverts commit ae6e03fa06
[formerly b4b8ba8be7].
Conflicts: wordfreq/transfer.py
Former-commit-id: 5c8ba34492
This commit is contained in:
parent
4d904a3bae
commit
5fc933495f
@ -36,6 +36,7 @@ def read_multilingual_csv(filename):
|
|||||||
raw_freqs = _read_csv_basic(filename)
|
raw_freqs = _read_csv_basic(filename)
|
||||||
for wordlang in raw_freqs:
|
for wordlang in raw_freqs:
|
||||||
word, lang = wordlang.rsplit('|', 1)
|
word, lang = wordlang.rsplit('|', 1)
|
||||||
|
word = standardize_word(word)
|
||||||
unscaled[lang][word] = raw_freqs[wordlang]
|
unscaled[lang][word] = raw_freqs[wordlang]
|
||||||
|
|
||||||
scaled = {}
|
scaled = {}
|
||||||
@ -87,7 +88,10 @@ def _scale_freqs(counts):
|
|||||||
"""
|
"""
|
||||||
freqs = {}
|
freqs = {}
|
||||||
total = sum(counts.values())
|
total = sum(counts.values())
|
||||||
return {word: count / total for word, count in freqs.items()}
|
for word in counts:
|
||||||
|
freqs[word] = counts[word] / total
|
||||||
|
|
||||||
|
return freqs
|
||||||
|
|
||||||
|
|
||||||
def save_wordlist_to_db(conn, listname, lang, freqs):
|
def save_wordlist_to_db(conn, listname, lang, freqs):
|
||||||
@ -115,11 +119,11 @@ def create_db(filename):
|
|||||||
This should be safe to run (and have no effect) if the database already
|
This should be safe to run (and have no effect) if the database already
|
||||||
exists.
|
exists.
|
||||||
"""
|
"""
|
||||||
|
conn = get_db_connection(filename)
|
||||||
base_dir = os.path.dirname(filename)
|
base_dir = os.path.dirname(filename)
|
||||||
if not os.path.exists(base_dir):
|
if not os.path.exists(base_dir):
|
||||||
os.makedirs(base_dir)
|
os.makedirs(base_dir)
|
||||||
|
|
||||||
conn = get_db_connection(filename)
|
|
||||||
conn.execute(schema.SCHEMA)
|
conn.execute(schema.SCHEMA)
|
||||||
for index_definition in schema.INDICES:
|
for index_definition in schema.INDICES:
|
||||||
conn.execute(index_definition)
|
conn.execute(index_definition)
|
||||||
|
@ -17,7 +17,7 @@ CACHE_SIZE = 100000
|
|||||||
# Where can the data be downloaded from?
|
# Where can the data be downloaded from?
|
||||||
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
|
DOWNLOAD_URL = (os.environ.get('WORDFREQ_URL')
|
||||||
or 'http://ferret.lumi/dist/wordfreq/')
|
or 'http://ferret.lumi/dist/wordfreq/')
|
||||||
RAW_DATA_URL = '/'.join([DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz'])
|
RAW_DATA_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION, 'wordfreq-data.tar.gz')
|
||||||
DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
|
DB_URL = os.path.join(DOWNLOAD_URL, MINOR_VERSION,
|
||||||
'wordfreq-%s.db' % MINOR_VERSION)
|
'wordfreq-%s.db' % MINOR_VERSION)
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ def download_and_extract_raw_data(url=None, root_dir=None):
|
|||||||
ensure_dir_exists(dest_filename)
|
ensure_dir_exists(dest_filename)
|
||||||
download(url, dest_filename)
|
download(url, dest_filename)
|
||||||
|
|
||||||
logger.info("Extracting %s", dest_filename)
|
logger.info("Extracting %s" % dest_filename)
|
||||||
with tarfile.open(dest_filename, 'r') as tarf:
|
with tarfile.open(dest_filename, 'r') as tarf:
|
||||||
tarf.extractall(root_dir)
|
tarf.extractall(root_dir)
|
||||||
|
|
||||||
@ -106,36 +106,33 @@ def upload_data(upload_path=None):
|
|||||||
This requires that it's running in a reasonable Unix environment,
|
This requires that it's running in a reasonable Unix environment,
|
||||||
and more notably, that it has the proper SSH keys to upload to that
|
and more notably, that it has the proper SSH keys to upload to that
|
||||||
server.
|
server.
|
||||||
|
|
||||||
It should also only be run in Python 3, because otherwise you're probably
|
|
||||||
uploading the wrong data. We can even ensure this by using features that
|
|
||||||
are specific to Python 3.
|
|
||||||
"""
|
"""
|
||||||
from tempfile import TemporaryDirectory
|
|
||||||
if upload_path is None:
|
if upload_path is None:
|
||||||
upload_path = config.UPLOAD_PATH
|
upload_path = config.UPLOAD_PATH
|
||||||
|
|
||||||
|
build_dir = tempfile.mkdtemp('.wordfreq')
|
||||||
|
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
|
||||||
|
os.makedirs(version_dir)
|
||||||
|
|
||||||
with TemporaryDirectory('.wordfreq') as build_tmp:
|
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
|
||||||
build_dir = build_tmp.name
|
logger.info("Creating %s" % source_filename)
|
||||||
version_dir = os.path.join(build_dir, config.MINOR_VERSION)
|
with tarfile.open(source_filename, 'w:gz') as tarf:
|
||||||
os.makedirs(version_dir)
|
tarf.add(config.RAW_DATA_DIR)
|
||||||
|
|
||||||
source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
|
logger.info("Copying database file %s" % config.DB_FILENAME)
|
||||||
logger.info("Creating %s", source_filename)
|
subprocess.call([
|
||||||
with tarfile.open(source_filename, 'w:gz') as tarf:
|
'/bin/cp',
|
||||||
tarf.add(config.RAW_DATA_DIR)
|
config.DB_FILENAME,
|
||||||
|
version_dir
|
||||||
|
])
|
||||||
|
|
||||||
logger.info("Copying database file %s", config.DB_FILENAME)
|
logger.info("Uploading to %s" % upload_path)
|
||||||
subprocess.call([
|
subprocess.call([
|
||||||
'/bin/cp',
|
'/usr/bin/rsync',
|
||||||
config.DB_FILENAME,
|
'-avz',
|
||||||
version_dir
|
version_dir,
|
||||||
])
|
upload_path
|
||||||
|
])
|
||||||
|
|
||||||
logger.info("Uploading to %s", upload_path)
|
logger.info("Removing build directory %s" % build_dir)
|
||||||
subprocess.call([
|
shutil.rmtree(build_dir)
|
||||||
'/usr/bin/rsync',
|
|
||||||
'-avz',
|
|
||||||
version_dir,
|
|
||||||
upload_path
|
|
||||||
])
|
|
||||||
|
Loading…
Reference in New Issue
Block a user