Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Commit 8a48e57749 (parent a95d88d1b9): now this package has tests
tests/test_build.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from wordfreq.build import load_all_data
from wordfreq.transfer import download_and_extract_raw_data
from wordfreq import config
import os
import tempfile
import shutil


def test_build():
    """
    Ensure that the build process builds the same DB that gets distributed.
    """
    if not os.path.exists(config.RAW_DATA_DIR):
        download_and_extract_raw_data()

    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        load_all_data(config.RAW_DATA_DIR, db_file)

        assert open(db_file).read() == open(config.DB_FILENAME).read()
    finally:
        shutil.rmtree(tempdir)
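For context, the workflow this test verifies can be reproduced with the same calls the test makes; a minimal sketch (not part of this commit), assuming the default config paths:

# Sketch only: rebuild the distributed database using the same API the test exercises.
import os

from wordfreq import config
from wordfreq.build import load_all_data
from wordfreq.transfer import download_and_extract_raw_data

if not os.path.exists(config.RAW_DATA_DIR):
    download_and_extract_raw_data()                      # fetch the raw wordlist data once
load_all_data(config.RAW_DATA_DIR, config.DB_FILENAME)   # build straight to the shipped DB path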
tests/test_queries.py (new file, 37 lines)
@@ -0,0 +1,37 @@
from __future__ import unicode_literals
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
                            get_wordlists)


def test_freq_examples():
    assert_almost_equal(
        word_frequency('normalization', 'en', 'google-books'),
        1.767e-6, places=9
    )
    assert_almost_equal(
        word_frequency('normalisation', 'fr', 'leeds-internet'),
        4.162e-6, places=9
    )
    assert_greater(
        word_frequency('lol', 'xx', 'twitter'),
        word_frequency('lol', 'en', 'google-books')
    )
    eq_(
        word_frequency('totallyfakeword', 'en', 'multi', -1),
        -1
    )


def _check_normalized_frequencies(wordlist, lang):
    assert_almost_equal(
        average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),
        1.0, places=6
    )


def test_normalized_frequencies():
    for list_info in get_wordlists():
        wordlist = list_info['wordlist']
        lang = list_info['lang']
        yield _check_normalized_frequencies, wordlist, lang
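The frequency checks above double as a reference for the query API; a minimal interactive sketch (assuming the bundled database is installed) would be:

# Sketch only: looking up word frequencies directly.
from wordfreq.query import word_frequency

print(word_frequency('normalization', 'en', 'google-books'))   # ~1.767e-6 per the test above
print(word_frequency('lol', 'xx', 'twitter'))                   # higher on Twitter than in Google Books
print(word_frequency('totallyfakeword', 'en', 'multi', 0))      # 4th argument is the value for missing words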
wordfreq/build.py
@@ -111,13 +111,14 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     conn.commit()


-def create_db(conn):
+def create_db(filename):
     """
     Create a wordlist database, at the filename specified by `wordfreq.config`.

     This should be safe to run (and have no effect) if the database already
     exists.
     """
+    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
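The upshot of this hunk is that create_db now takes a filename and opens its own connection via get_db_connection, instead of expecting the caller to pass one in. A hypothetical caller after this change (not from this commit) would look roughly like:

# Sketch only: create_db is now called with a filename, not a connection.
from wordfreq import config
from wordfreq.build import create_db

create_db(config.DB_FILENAME)   # opens its own DB connection internally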
@@ -143,10 +144,10 @@ def load_all_data(source_dir=None, filename=None):
     if filename is None:
         filename = config.DB_FILENAME

-    conn = get_db_connection(filename)
     logger.info("Creating database")
-    create_db(conn)
+    create_db(filename)

+    conn = get_db_connection(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
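Read together with the previous hunk, the reordering keeps load_all_data consistent with the new create_db: the schema is created on create_db's own connection first, and only then is the loading connection opened. A condensed sketch of the resulting flow, assuming the rest of the function is unchanged:

# Condensed sketch of load_all_data after this commit (not the full function).
def load_all_data(source_dir=None, filename=None):
    if filename is None:
        filename = config.DB_FILENAME
    create_db(filename)                  # create the DB file and schema first
    conn = get_db_connection(filename)   # then open the connection used for loading
    # ... load the Leeds internet corpus and the other wordlists into conn ...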
wordfreq/query.py
@@ -95,6 +95,15 @@ def iter_wordlist(wordlist='multi', lang=None):
     return results


+def get_wordlists():
+    c = CONN.cursor()
+    results = c.execute(
+        "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
+    )
+    for wordlist, lang, count in results:
+        yield {'wordlist': wordlist, 'lang': lang, 'count': count}
+
+
 METANL_CONSTANT = 50291582140.06433
 def metanl_word_frequency(word, lang, default=0.):
     """
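The new get_wordlists generator is what test_normalized_frequencies iterates over; a minimal usage sketch (assuming the database is installed):

# Sketch only: enumerate every (wordlist, lang) pair with its word count.
from wordfreq.query import get_wordlists

for info in get_wordlists():
    print(info['wordlist'], info['lang'], info['count'])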