Clear wordlists before inserting them; yell at Python 2

This commit is contained in:
Rob Speer 2013-11-01 19:29:37 -04:00
parent 5c8ba34492
commit 823b3828cd
2 changed files with 36 additions and 3 deletions

View File

@ -26,7 +26,7 @@ def test_build():
tempdir = tempfile.mkdtemp('.wordfreq') tempdir = tempfile.mkdtemp('.wordfreq')
try: try:
db_file = os.path.join(tempdir, 'test.db') db_file = os.path.join(tempdir, 'test.db')
load_all_data(config.RAW_DATA_DIR, db_file) load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
conn = sqlite3.connect(db_file) conn = sqlite3.connect(db_file)
# Compare the information we got to the information in the default DB. # Compare the information we got to the information in the default DB.
@ -43,3 +43,18 @@ def test_build():
eq_(new_info[i], old_info[i]) eq_(new_info[i], old_info[i])
finally: finally:
shutil.rmtree(tempdir) shutil.rmtree(tempdir)
def test_python2():
    """
    Python 2 got to skip two tests up there, because we built a slightly
    wrong wordlist. Now let's test that, in normal operation, it will refuse
    to build this wordlist.

    On Python 2, calling `load_all_data` without `do_it_anyway=True` is
    expected to raise UnicodeError. On other versions this test does nothing.
    """
    if not PYTHON2:
        return

    # Use a scratch directory (as test_build does) so that we pass a real
    # path string -- `tempfile.mkstemp()` returns an (fd, path) tuple and
    # leaves an open file behind -- and so that everything is cleaned up.
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        try:
            load_all_data(config.RAW_DATA_DIR, db_file)
        except UnicodeError:
            # This is the correct case
            pass
        else:
            # Raise explicitly rather than `assert False`, which would be
            # stripped when Python runs with -O.
            raise AssertionError("The database should not have been built")
    finally:
        shutil.rmtree(tempdir)

View File

@ -3,6 +3,7 @@ import sqlite3
import codecs import codecs
import re import re
import os import os
import sys
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
""" """
rows = [(listname, lang, word, freq) rows = [(listname, lang, word, freq)
for word, freq in freqs.items()] for word, freq in freqs.items()]
conn.execute('DELETE FROM words where wordlist=? and lang=?',
(listname, lang))
conn.executemany( conn.executemany(
"INSERT OR REPLACE INTO words (wordlist, lang, word, freq) " "INSERT INTO words (wordlist, lang, word, freq) "
"VALUES (?, ?, ?, ?)", "VALUES (?, ?, ?, ?)",
rows rows
) )
@ -135,10 +138,25 @@ def get_db_connection(filename):
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh') LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
def load_all_data(source_dir=None, filename=None): def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
""" """
Load data from the raw data files into a SQLite database. Load data from the raw data files into a SQLite database.
Python 3 has more complete Unicode support than Python 2, and this shows
up as actual differences in the set of words. For the sake of consistency,
we say that the data is only valid when built on Python 3.
Python 2 can still *use* wordfreq, by downloading the database that was
built on Python 3.
If you insist on building the Python 2 version, pass `do_it_anyway=True`.
""" """
if sys.version_info.major == 2 and not do_it_anyway:
raise UnicodeError(
"Python 2.x has insufficient Unicode support, and will build "
"the wrong database. Pass `do_it_anyway=True` to do it anyway."
)
if source_dir is None: if source_dir is None:
source_dir = config.RAW_DATA_DIR source_dir = config.RAW_DATA_DIR