revise config.py, clarify some of query.py

This commit is contained in:
Robyn Speer 2013-10-29 12:18:38 -04:00
parent a92fed80cf
commit c0ed89c015
2 changed files with 31 additions and 8 deletions

View File

@ -1,8 +1,22 @@
import os
# This directory stores versions of the wordfreq database.
DB_DIR = (os.environ.get('WORDFREQ_DATA')
or os.path.expanduser('~/.cache/wordfreq'))
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
VERSION = '0.1'
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
LRU_SIZE = 100000
# Where should raw data go? Inside the package isn't necessarily a good
# place for it, because it might be installed in the system site-packages.
#
# The current directory -- as you're running the setup.py script -- seems
# as reasonable as anything.
RAW_DATA_DIR = './wordfreq_data'
# When the minor version number increments, the data may change.
VERSION = '0.1.1'
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
# Put these options together to make a database filename.
DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION)
# How many words do we cache the frequencies for?
CACHE_SIZE = 100000

View File

@ -1,4 +1,4 @@
from wordfreq.config import DB_FILENAME, LRU_SIZE
from wordfreq.config import DB_FILENAME, CACHE_SIZE
from functools32 import lru_cache
import sqlite3
@ -19,7 +19,7 @@ except sqlite3.OperationalError:
raise IOError(SQLITE_ERROR_TEXT)
@lru_cache(maxsize=LRU_SIZE)
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
@ -36,14 +36,23 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
else:
return row[0]
# I'm sorry.
METANL_CONSTANT = 50291582140.06433
def metanl_word_frequency(word, lang, default=0.):
"""
Return a word's frequency in a form that matches the output of
metanl 0.6.
In wordfreq, frequencies are proportions. They add up to 1 within a
wordlist and language.
In metanl, we had decided arbitrarily that common words should have a
frequency of a billion or so. There was no real reason.
This function provides compatibility by adapting wordfreq to give the
same output as metanl. It does this by multiplying the word frequency in
the 'multi' list by a big ugly constant. Oh well.
"""
freq = word_frequency(word, lang, 'multi', None)
freq = word_frequency(word, lang, 'multi', default=None)
if freq is None:
return default
else: