diff --git a/tests/test_build.py b/tests/test_build.py
index 31c9c04..56b0977 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -26,7 +26,7 @@ def test_build():
     tempdir = tempfile.mkdtemp('.wordfreq')
     try:
         db_file = os.path.join(tempdir, 'test.db')
-        load_all_data(config.RAW_DATA_DIR, db_file)
+        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
         conn = sqlite3.connect(db_file)
 
         # Compare the information we got to the information in the default DB.
@@ -43,3 +43,26 @@ def test_build():
                 eq_(new_info[i], old_info[i])
     finally:
         shutil.rmtree(tempdir)
+
+
+def test_python2():
+    """
+    Python 2 got to skip two tests up there, because we built a slightly
+    wrong wordlist. Now let's test that, in normal operation, it will refuse
+    to build this wordlist.
+    """
+    if PYTHON2:
+        # Point at a path inside a scratch directory (like test_build does)
+        # so that nothing is left behind whether or not the build is refused.
+        tempdir = tempfile.mkdtemp('.wordfreq')
+        try:
+            db_file = os.path.join(tempdir, 'test.db')
+            try:
+                load_all_data(config.RAW_DATA_DIR, db_file)
+            except UnicodeError:
+                # This is the correct case
+                pass
+            else:
+                assert False, "The database should not have been built"
+        finally:
+            shutil.rmtree(tempdir)
diff --git a/wordfreq/build.py b/wordfreq/build.py
index d5f1609..486b559 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -3,6 +3,7 @@ import sqlite3
 import codecs
 import re
 import os
+import sys
 import logging
 logger = logging.getLogger(__name__)
 
@@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     """
     rows = [(listname, lang, word, freq)
             for word, freq in freqs.items()]
+    conn.execute('DELETE FROM words where wordlist=? and lang=?',
+                 (listname, lang))
     conn.executemany(
-        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
+        "INSERT INTO words (wordlist, lang, word, freq) "
         "VALUES (?, ?, ?, ?)",
         rows
     )
@@ -135,9 +138,25 @@ def get_db_connection(filename):
 LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
 
 
-def load_all_data(source_dir=None, filename=None):
+def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
     """
     Load data from the raw data files into a SQLite database.
+
+    Python 3 has more complete Unicode support than Python 2, and this shows
+    up as actual differences in the set of words. For the sake of consistency,
+    we say that the data is only valid when built on Python 3.
+
+    Python 2 can still *use* wordfreq, by downloading the database that was
+    built on Python 3.
+
+    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
     """
+    # Index instead of `.major`: version_info is a plain tuple before 2.7.
+    if sys.version_info[0] == 2 and not do_it_anyway:
+        raise UnicodeError(
+            "Python 2.x has insufficient Unicode support, and will build "
+            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
+        )
+
     if source_dir is None:
         source_dir = config.RAW_DATA_DIR