Add a couple of useful statistics about wordlists

This commit is contained in:
Robyn Speer 2013-10-29 16:42:38 -04:00
parent 67fefa5dd5
commit 4fc1971b0f
2 changed files with 96 additions and 1 deletions

View File

@ -37,7 +37,40 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
return row[0] return row[0]
def wordlist_size(wordlist, lang=None):
    """
    Count the rows of the `words` table that belong to `wordlist`.

    When `lang` is given, only rows with that language are counted;
    when it is None, rows of every language in the wordlist are counted.
    """
    if lang is None:
        query = "SELECT count(*) from words where wordlist=?"
        params = (wordlist,)
    else:
        query = "SELECT count(*) from words where wordlist=? and lang=?"
        params = (wordlist, lang)

    cursor = CONN.cursor()
    cursor.execute(query, params)
    row = cursor.fetchone()
    return row[0]
def average_frequency(wordlist, lang):
    """
    Return the mean of the `freq` column over the rows of the `words`
    table matching `wordlist` and `lang`. This can be kind of slow.

    A typical use is smoothing: if you add the same baseline number to
    every word frequency, this average tells you what a good baseline
    is. (For multi/en, it's 6.7e-07.)
    """
    query = "SELECT avg(freq) from words where wordlist=? and lang=?"
    cursor = CONN.cursor()
    cursor.execute(query, (wordlist, lang))
    row = cursor.fetchone()
    return row[0]
def iter_wordlist(wordlist='multi', lang=None):
""" """
Returns a generator, yielding (word, lang, frequency) triples from Returns a generator, yielding (word, lang, frequency) triples from
a wordlist in descending order of frequency. a wordlist in descending order of frequency.

62
wordfreq/transfer.py Normal file
View File

@ -0,0 +1,62 @@
import urllib, os, sys
import tarfile
from wordfreq import config
import logging
logger = logging.getLogger(__name__)
class ProgressTracker(object):
    """
    Prints a single-line, self-overwriting download progress indicator
    to stdout.

    `report_progress` has the (count, blockSize, totalSize) signature of
    a `urlretrieve` reporthook.
    """

    def __init__(self, url):
        self.url = url
        # Last value shown: None before any output, -1 once the
        # size-unknown message has been shown, otherwise a percentage.
        self.progress = None

    def report_progress(self, count, blockSize, totalSize):
        """
        Update the progress line after `count` blocks of `blockSize`
        bytes have been transferred out of `totalSize` bytes.

        The line is only rewritten when the displayed value changes.
        """
        if totalSize <= 0:
            # The total size is unknown (reported as 0 or negative), so a
            # percentage cannot be computed; show the URL once instead of
            # dividing by zero or printing a negative number.
            if self.progress is None:
                sys.stdout.write("\rDownloading %s..." % self.url)
                sys.stdout.flush()
                self.progress = -1
            return

        # The last block is usually only partly filled, so
        # count * blockSize can exceed totalSize; clamp to 100.
        percent = min(100, count * blockSize * 100 // totalSize)
        if percent != self.progress:
            sys.stdout.write(
                "\rDownloading %s... %2d%%" % (self.url, percent)
            )
            sys.stdout.flush()
            self.progress = percent

    def finish(self):
        """Terminate the progress line with a newline."""
        sys.stdout.write('\n')
def download(url, dest_filename):
    """
    Download the file at `url` to `dest_filename`. Show a progress bar
    while downloading.

    The parent directory of `dest_filename` is created if it does not
    exist yet. Returns True once the download has completed; errors from
    creating the directory or retrieving the URL propagate to the caller.
    """
    # urlretrieve lives in urllib.request on Python 3 and directly in
    # urllib on Python 2; support both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_dir = os.path.dirname(dest_filename)
    # dirname is '' for a bare filename, and os.makedirs('') raises, so
    # only create a directory when there is one to create.
    if base_dir and not os.path.exists(base_dir):
        try:
            os.makedirs(base_dir)
        except OSError:
            # Another process may have created it between the check and
            # the makedirs call; only re-raise if it is still missing.
            if not os.path.isdir(base_dir):
                raise

    tracker = ProgressTracker(url)
    urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
    tracker.finish()
    logger.info("Saved database to %s", dest_filename)
    return True
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the raw data tarball and unpack it under `root_dir`.

    `url` defaults to `config.RAW_DATA_URL`, and `root_dir` defaults to
    the parent directory of `config.RAW_DATA_DIR`. The archive is saved
    as 'wordfreq-data.tar.gz' inside `root_dir` before being extracted
    there.
    """
    if url is None:
        url = config.RAW_DATA_URL
    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    download(url, local_filename)

    logger.info("Extracting %s", local_filename)
    # Mode 'r' lets tarfile detect the compression transparently.
    with tarfile.open(local_filename, 'r') as tarf:
        # The TarFile method is `extractall`; `extract_all` does not exist
        # and raised AttributeError.
        # NOTE(review): extractall trusts the member paths stored in the
        # archive. Only use this with archives from a trusted URL.
        tarf.extractall(root_dir)
def download_db(url=None, target=None):
    """
    Download the database file via `download`.

    `url` falls back to `config.DB_URL` and `target` falls back to
    `config.DB_FILENAME` when they are not supplied.
    """
    source = config.DB_URL if url is None else url
    destination = config.DB_FILENAME if target is None else target
    download(source, destination)