Add a couple of useful statistics about wordlists

2024-12-23 17:31:41 +00:00 · 2013-10-29 16:42:38 -04:00 · 2013-10-29 16:42:38 -04:00 · 4fc1971b0f
commit 4fc1971b0f
parent 67fefa5dd5
2 changed files with 96 additions and 1 deletions
--- a/wordfreq/query.py
+++ b/wordfreq/query.py
@@ -37,7 +37,40 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
        return row[0]
-def iter_wordlist(wordlist, lang=None):
def wordlist_size(wordlist, lang=None):
    """
    Count the rows of the `words` table that belong to `wordlist`.

    When `lang` is given, only rows for that language are counted;
    otherwise every language in the wordlist is included.
    """
    if lang is not None:
        query = "SELECT count(*) from words where wordlist=? and lang=?"
        params = (wordlist, lang)
    else:
        query = "SELECT count(*) from words where wordlist=?"
        params = (wordlist,)

    cursor = CONN.cursor()
    cursor.execute(query, params)
    row = cursor.fetchone()
    return row[0]
 def average_frequency(wordlist, lang):
     """
     Return the mean of the `freq` column over all words of `lang` in
     `wordlist`. This is a fairly slow query.

     The result is a reasonable baseline when smoothing word frequencies
     by adding one constant to each of them. (For multi/en, it's 6.7e-07.)
     """
     query = "SELECT avg(freq) from words where wordlist=? and lang=?"
     cursor = CONN.cursor()
     cursor.execute(query, (wordlist, lang))
     row = cursor.fetchone()
     return row[0]
 def iter_wordlist(wordlist='multi', lang=None):
    """
    Returns a generator, yielding (word, lang, frequency) triples from
    a wordlist in descending order of frequency.
--- a/wordfreq/transfer.py
+++ b/wordfreq/transfer.py
@@ -0,0 +1,62 @@
 import urllib, os, sys
 import tarfile
 from wordfreq import config
 import logging
 # Module-level logger, named after this module via __name__.
 logger = logging.getLogger(__name__)
 class ProgressTracker(object):
     """
     Print a one-line, self-overwriting download progress indicator to
     stdout. `report_progress` has the signature of a urlretrieve
     `reporthook` callback.
     """

     def __init__(self, url):
         self.url = url
         # Last value displayed: None before any report, -1 while the total
         # size is unknown, otherwise a percentage in 0..100.
         self.progress = None

     def report_progress(self, count, blockSize, totalSize):
         """
         Update the progress line.

         `count` is the number of blocks transferred so far, `blockSize`
         the size of a block, and `totalSize` the size of the whole file.
         A non-positive `totalSize` is treated as "size unknown": the URL
         is shown without a percentage.
         """
         if totalSize > 0:
             # The final block usually overshoots the total, so clamp to 100.
             percent = min(100, int(count * blockSize * 100 / totalSize))
             message = "\rDownloading %s... %2d%%" % (self.url, percent)
         else:
             # Avoid dividing by zero or printing a negative percentage.
             percent = -1
             message = "\rDownloading %s..." % self.url

         # Only redraw when the displayed value actually changes.
         if percent != self.progress:
             sys.stdout.write(message)
             sys.stdout.flush()
             self.progress = percent

     def finish(self):
         """
         Terminate the progress line with a newline.
         """
         sys.stdout.write('\n')
 def download(url, dest_filename):
     """
     Download the file at `url` to `dest_filename`. Show a progress bar
     while downloading.

     The directory part of `dest_filename` is created if it does not
     exist yet. Returns True once the download has completed; errors
     raised while creating the directory or retrieving the URL propagate
     to the caller.
     """
     # urlretrieve lives in urllib.request on Python 3 and directly in
     # urllib on Python 2; import it locally so either interpreter works.
     try:
         from urllib.request import urlretrieve
     except ImportError:
         from urllib import urlretrieve

     base_dir = os.path.dirname(dest_filename)
     # dirname() is '' for a bare filename, and os.makedirs('') raises.
     if base_dir and not os.path.exists(base_dir):
         os.makedirs(base_dir)

     tracker = ProgressTracker(url)
     try:
         urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
     finally:
         # Always end the progress line, even if the download failed.
         tracker.finish()
     logger.info("Saved database to %s" % dest_filename)
     return True
 def download_and_extract_raw_data(url=None, root_dir=None):
     """
     Download the raw-data tarball and unpack it into `root_dir`.

     `url` defaults to `config.RAW_DATA_URL` and `root_dir` defaults to
     the parent directory of `config.RAW_DATA_DIR`. The archive is saved
     as 'wordfreq-data.tar.gz' inside `root_dir` and then extracted there.
     """
     if url is None:
         url = config.RAW_DATA_URL
     if root_dir is None:
         root_dir = os.path.dirname(config.RAW_DATA_DIR)

     local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
     download(url, local_filename)

     logger.info("Extracting %s" % local_filename)
     # Mode 'r' lets tarfile detect the compression transparently.
     with tarfile.open(local_filename, 'r') as tarf:
         # The TarFile method is `extractall`; `extract_all` does not exist
         # and raised AttributeError.
         # NOTE(review): extractall() trusts member paths in the archive;
         # only use this with a trusted download URL.
         tarf.extractall(root_dir)
 def download_db(url=None, target=None):
     """
     Download the database file at `url` and save it as `target`.

     A missing `url` falls back to `config.DB_URL`, and a missing
     `target` falls back to `config.DB_FILENAME`.
     """
     source = config.DB_URL if url is None else url
     destination = config.DB_FILENAME if target is None else target
     download(source, destination)