Clear wordlists before inserting them; yell at Python 2

Former-commit-id: 823b3828cd
This commit is contained in:
Robyn Speer 2013-11-01 19:29:37 -04:00
parent 5fc933495f
commit 5f7c7e032c
2 changed files with 36 additions and 3 deletions

View File

@ -26,7 +26,7 @@ def test_build():
tempdir = tempfile.mkdtemp('.wordfreq')
try:
db_file = os.path.join(tempdir, 'test.db')
load_all_data(config.RAW_DATA_DIR, db_file)
load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
conn = sqlite3.connect(db_file)
# Compare the information we got to the information in the default DB.
@ -43,3 +43,18 @@ def test_build():
eq_(new_info[i], old_info[i])
finally:
shutil.rmtree(tempdir)
def test_python2():
    """
    Check that building the wordlist database is refused on Python 2.

    Python 2 got to skip two tests up there, because we built a slightly
    wrong wordlist. Now let's test that, in normal operation, it will refuse
    to build this wordlist.

    On other Python versions this test does nothing.
    """
    if not PYTHON2:
        return

    # Build into a path inside a scratch directory, the same way test_build
    # does. `tempfile.mkstemp()` returns an (fd, path) tuple rather than a
    # filename, and it leaves an open descriptor and a file behind.
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        try:
            load_all_data(config.RAW_DATA_DIR, db_file)
        except UnicodeError:
            # This is the correct case
            return
        # Raised explicitly (not via `assert`) so the check survives `-O`.
        raise AssertionError("The database should not have been built")
    finally:
        shutil.rmtree(tempdir)

View File

@ -3,6 +3,7 @@ import sqlite3
import codecs
import re
import os
import sys
import logging
logger = logging.getLogger(__name__)
@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
"""
rows = [(listname, lang, word, freq)
for word, freq in freqs.items()]
conn.execute('DELETE FROM words where wordlist=? and lang=?',
(listname, lang))
conn.executemany(
"INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
"INSERT INTO words (wordlist, lang, word, freq) "
"VALUES (?, ?, ?, ?)",
rows
)
@ -135,10 +138,25 @@ def get_db_connection(filename):
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
def load_all_data(source_dir=None, filename=None):
def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
"""
Load data from the raw data files into a SQLite database.
Python 3 has more complete Unicode support than Python 2, and this shows
up as actual differences in the set of words. For the sake of consistency,
we say that the data is only valid when built on Python 3.
Python 2 can still *use* wordfreq, by downloading the database that was
built on Python 3.
If you insist on building the Python 2 version, pass `do_it_anyway=True`.
"""
if sys.version_info.major == 2 and not do_it_anyway:
raise UnicodeError(
"Python 2.x has insufficient Unicode support, and will build "
"the wrong database. Pass `do_it_anyway=True` to do it anyway."
)
if source_dir is None:
source_dir = config.RAW_DATA_DIR