mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Add a couple of useful statistics about wordlists
This commit is contained in:
parent
67fefa5dd5
commit
4fc1971b0f
@ -37,7 +37,40 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
|
||||
return row[0]
|
||||
|
||||
|
||||
def iter_wordlist(wordlist, lang=None):
|
||||
def wordlist_size(wordlist, lang=None):
    """
    Count the entries of the `words` table that belong to `wordlist`.

    When `lang` is given, only entries in that language are counted;
    when it is None, entries in every language are counted.
    """
    # Choose the statement and its parameters first, then run it once.
    if lang is not None:
        query = "SELECT count(*) from words where wordlist=? and lang=?"
        params = (wordlist, lang)
    else:
        query = "SELECT count(*) from words where wordlist=?"
        params = (wordlist,)

    cursor = CONN.cursor()
    cursor.execute(query, params)
    row = cursor.fetchone()
    return row[0]
|
||||
|
||||
|
||||
def average_frequency(wordlist, lang):
    """
    Return the mean of the `freq` column over the entries of `wordlist`
    in language `lang`. This is a fairly slow query.

    A typical use is picking a baseline for additive smoothing: if you
    add the same constant to every word frequency, the average frequency
    is a reasonable constant to add. (For multi/en, it's 6.7e-07.)
    """
    query = "SELECT avg(freq) from words where wordlist=? and lang=?"
    cursor = CONN.cursor()
    cursor.execute(query, (wordlist, lang))
    row = cursor.fetchone()
    return row[0]
|
||||
|
||||
|
||||
def iter_wordlist(wordlist='multi', lang=None):
|
||||
"""
|
||||
Returns a generator, yielding (word, lang, frequency) triples from
|
||||
a wordlist in descending order of frequency.
|
||||
|
62
wordfreq/transfer.py
Normal file
62
wordfreq/transfer.py
Normal file
@ -0,0 +1,62 @@
|
||||
import urllib, os, sys
|
||||
import tarfile
|
||||
from wordfreq import config
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProgressTracker(object):
    """
    Print a single, self-overwriting progress line for a download.

    `report_progress` takes the (count, blockSize, totalSize) arguments of
    a `urlretrieve` reporthook, so the bound method can be passed directly
    as `reporthook`.
    """

    def __init__(self, url):
        # The URL is only used as a label in the progress message.
        self.url = url
        # Last value that was printed; None until something is reported.
        self.progress = None

    def report_progress(self, count, blockSize, totalSize):
        """
        Write the current progress to stdout if it changed since the
        last call.

        count: number of blocks transferred so far.
        blockSize: size of one block, in bytes.
        totalSize: total size in bytes. A value of zero or less means the
            size is unknown; then the message is printed once without a
            percentage instead of dividing by a meaningless total.
        """
        if totalSize <= 0:
            # -1 marks "unknown size already reported" in self.progress.
            percent = -1
            message = "\rDownloading %s..." % self.url
        else:
            # The final block usually overshoots the total, so clamp to
            # 100. Integer division keeps the result an int on both
            # Python 2 and Python 3.
            percent = min(100, count * blockSize * 100 // totalSize)
            message = "\rDownloading %s... %2d%%" % (self.url, percent)

        # Only rewrite the line when the displayed value actually changes.
        if percent != self.progress:
            sys.stdout.write(message)
            sys.stdout.flush()
            self.progress = percent

    def finish(self):
        """
        Terminate the progress line so later output starts on a new line.
        """
        sys.stdout.write('\n')
        sys.stdout.flush()
|
||||
|
||||
|
||||
def download(url, dest_filename):
    """
    Download the file at `url` to `dest_filename`. Show a progress
    indicator on stdout while downloading.

    The directory part of `dest_filename` is created if it does not exist.
    Returns True when the download completes; errors from creating the
    directory or from the download itself propagate to the caller.
    """
    # `urlretrieve` is in `urllib.request` on Python 3 and directly in
    # `urllib` on Python 2. Importing it here lets either version work.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_dir = os.path.dirname(dest_filename)
    # A bare filename has an empty dirname, and os.makedirs('') raises.
    if base_dir and not os.path.isdir(base_dir):
        try:
            os.makedirs(base_dir)
        except OSError:
            # Another process may have created the directory in between
            # the check and the call; only re-raise real failures.
            if not os.path.isdir(base_dir):
                raise

    tracker = ProgressTracker(url)
    try:
        urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
    finally:
        # End the progress line even if the download fails part-way.
        tracker.finish()
    logger.info("Saved database to %s", dest_filename)
    return True
|
||||
|
||||
|
||||
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the raw-data tarball and unpack it under `root_dir`.

    url: where to fetch the tarball from; defaults to config.RAW_DATA_URL.
    root_dir: directory that receives 'wordfreq-data.tar.gz' and the
        extracted contents; defaults to the parent directory of
        config.RAW_DATA_DIR.
    """
    if url is None:
        url = config.RAW_DATA_URL

    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    download(url, local_filename)

    logger.info("Extracting %s", local_filename)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, so this should only be pointed at a trusted URL.
    with tarfile.open(local_filename, 'r') as tarf:
        # The TarFile method is `extractall`; `extract_all` does not exist.
        tarf.extractall(root_dir)
|
||||
|
||||
|
||||
def download_db(url=None, target=None):
    """
    Download the database file.

    Any argument left as None falls back to the configured value:
    config.DB_URL for `url`, config.DB_FILENAME for `target`.
    """
    source_url = config.DB_URL if url is None else url
    destination = config.DB_FILENAME if target is None else target
    download(source_url, destination)
|
Loading…
Reference in New Issue
Block a user