mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
revise config.py, clarify some of query.py
This commit is contained in:
parent
a92fed80cf
commit
c0ed89c015
@ -1,8 +1,22 @@
|
||||
import os
|
||||
|
||||
# This directory stores versions of the wordfreq database.
|
||||
DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
||||
or os.path.expanduser('~/.cache/wordfreq'))
|
||||
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
|
||||
VERSION = '0.1'
|
||||
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
|
||||
LRU_SIZE = 100000
|
||||
|
||||
# Where should raw data go? Inside the package isn't necessarily a good
|
||||
# place for it, because it might be installed in the system site-packages.
|
||||
#
|
||||
# The current directory -- as you're running the setup.py script -- seems
|
||||
# as reasonable as anything.
|
||||
RAW_DATA_DIR = './wordfreq_data'
|
||||
|
||||
# When the minor version number increments, the data may change.
|
||||
VERSION = '0.1.1'
|
||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||
|
||||
# Put these options together to make a database filename.
|
||||
DB_FILENAME = os.path.join(DB_DIR, "wordfreq-%s.db" % MINOR_VERSION)
|
||||
|
||||
# How many words do we cache the frequencies for?
|
||||
CACHE_SIZE = 100000
|
||||
|
@ -1,4 +1,4 @@
|
||||
from wordfreq.config import DB_FILENAME, LRU_SIZE
|
||||
from wordfreq.config import DB_FILENAME, CACHE_SIZE
|
||||
from functools32 import lru_cache
|
||||
import sqlite3
|
||||
|
||||
@ -19,7 +19,7 @@ except sqlite3.OperationalError:
|
||||
raise IOError(SQLITE_ERROR_TEXT)
|
||||
|
||||
|
||||
@lru_cache(maxsize=LRU_SIZE)
|
||||
@lru_cache(maxsize=CACHE_SIZE)
|
||||
def word_frequency(word, lang, wordlist='multi', default=0.):
|
||||
"""
|
||||
Get the frequency of `word` in the language with code `lang`, from the
|
||||
@ -36,14 +36,23 @@ def word_frequency(word, lang, wordlist='multi', default=0.):
|
||||
else:
|
||||
return row[0]
|
||||
|
||||
# I'm sorry.
|
||||
METANL_CONSTANT = 50291582140.06433
|
||||
def metanl_word_frequency(word, lang, default=0.):
|
||||
"""
|
||||
Return a word's frequency in a form that matches the output of
|
||||
metanl 0.6.
|
||||
|
||||
In wordfreq, frequencies are proportions. They add up to 1 within a
|
||||
wordlist and language.
|
||||
|
||||
In metanl, we had decided arbitrarily that common words should have a
|
||||
frequency of a billion or so. There was no real reason.
|
||||
|
||||
This function provides compatibility by adapting wordfreq to give the
|
||||
same output as metanl. It does this by multiplying the word frequency in
|
||||
the 'multi' list by a big ugly constant. Oh well.
|
||||
"""
|
||||
freq = word_frequency(word, lang, 'multi', None)
|
||||
freq = word_frequency(word, lang, 'multi', default=None)
|
||||
if freq is None:
|
||||
return default
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user