mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Add a couple of useful statistics about wordlists
This commit is contained in:
parent
67fefa5dd5
commit
4fc1971b0f
@ -37,7 +37,40 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
|
|||||||
return row[0]
|
return row[0]
|
||||||
|
|
||||||
|
|
||||||
def iter_wordlist(wordlist, lang=None):
|
def wordlist_size(wordlist, lang=None):
    """
    Get the number of words in a wordlist.

    Counts the rows of the `words` table whose `wordlist` column equals
    `wordlist`. When `lang` is given (not None), only rows whose `lang`
    column also matches are counted.
    """
    query = "SELECT count(*) from words where wordlist=?"
    params = (wordlist,)
    if lang is not None:
        # Narrow the count to a single language.
        query += " and lang=?"
        params += (lang,)

    cursor = CONN.cursor()
    cursor.execute(query, params)
    row = cursor.fetchone()
    return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
def average_frequency(wordlist, lang):
    """
    A kind of slow function to get the average frequency for words in a
    wordlist.

    The average is taken over the `freq` column of every row of the `words`
    table matching both `wordlist` and `lang`.

    If, for example, you're smoothing over word frequencies by adding the
    same baseline number to all of them, this can tell you what a good
    baseline is. (For multi/en, it's 6.7e-07.)
    """
    query = "SELECT avg(freq) from words where wordlist=? and lang=?"
    cursor = CONN.cursor()
    cursor.execute(query, (wordlist, lang))
    row = cursor.fetchone()
    return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
def iter_wordlist(wordlist='multi', lang=None):
|
||||||
"""
|
"""
|
||||||
Returns a generator, yielding (word, lang, frequency) triples from
|
Returns a generator, yielding (word, lang, frequency) triples from
|
||||||
a wordlist in descending order of frequency.
|
a wordlist in descending order of frequency.
|
||||||
|
62
wordfreq/transfer.py
Normal file
62
wordfreq/transfer.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import urllib, os, sys
|
||||||
|
import tarfile
|
||||||
|
from wordfreq import config
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ProgressTracker(object):
    """
    Writes a single, self-overwriting "Downloading <url>... NN%" line to
    standard output. `report_progress` has the signature of a
    `urlretrieve` reporthook.
    """

    def __init__(self, url):
        self.url = url
        # Last value shown: None before any output, a percentage (0-100)
        # once the total size is known, or -1 when the size is unknown.
        self.progress = None

    def report_progress(self, count, blockSize, totalSize):
        """
        Update the progress line.

        `count` is the number of blocks transferred so far, `blockSize` the
        size of a block in bytes, and `totalSize` the total size of the
        file. Nothing is written unless the displayed value changed.
        """
        if totalSize > 0:
            # The last block is usually partial, so count*blockSize can
            # overshoot the total; cap the display at 100%.
            percent = min(100, count * blockSize * 100 // totalSize)
            message = "\rDownloading %s... %2d%%" % (self.url, percent)
        else:
            # The total size may be reported as -1 (or 0) when it is not
            # known; show the message without a percentage instead of
            # dividing by it.
            percent = -1
            message = "\rDownloading %s..." % self.url

        if percent != self.progress:
            sys.stdout.write(message)
            sys.stdout.flush()
            self.progress = percent

    def finish(self):
        """
        Terminate the progress line with a newline.
        """
        sys.stdout.write('\n')
        sys.stdout.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def download(url, dest_filename):
    """
    Download the file at `url` to `dest_filename`. Show a progress bar
    while downloading.

    Missing parent directories of `dest_filename` are created first.
    Returns True once the file has been saved; any error raised while
    downloading propagates to the caller.
    """
    # urlretrieve lives in `urllib` on Python 2 and in `urllib.request` on
    # Python 3; import it locally so this works on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_dir = os.path.dirname(dest_filename)
    # dirname() is '' for a bare filename, and os.makedirs('') would raise.
    if base_dir and not os.path.exists(base_dir):
        os.makedirs(base_dir)

    tracker = ProgressTracker(url)
    try:
        urlretrieve(url, dest_filename, reporthook=tracker.report_progress)
    finally:
        # End the progress line even if the download fails part-way, so
        # later output does not land on the same terminal line.
        tracker.finish()
    logger.info("Saved database to %s", dest_filename)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_extract_raw_data(url=None, root_dir=None):
    """
    Download the raw data archive and unpack it.

    `url` defaults to `config.RAW_DATA_URL`, and `root_dir` defaults to the
    parent directory of `config.RAW_DATA_DIR`. The archive is saved as
    'wordfreq-data.tar.gz' inside `root_dir` and then extracted into
    `root_dir`.
    """
    if url is None:
        url = config.RAW_DATA_URL

    if root_dir is None:
        root_dir = os.path.dirname(config.RAW_DATA_DIR)

    local_filename = os.path.join(root_dir, 'wordfreq-data.tar.gz')
    download(url, local_filename)

    logger.info("Extracting %s", local_filename)
    with tarfile.open(local_filename, 'r') as tarf:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point this at a URL whose contents you trust.
        tarf.extractall(root_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def download_db(url=None, target=None):
    """
    Download the database file.

    `url` falls back to `config.DB_URL` and `target` to
    `config.DB_FILENAME` when they are None.
    """
    source = config.DB_URL if url is None else url
    destination = config.DB_FILENAME if target is None else target
    download(source, destination)
|
Loading…
Reference in New Issue
Block a user