Clear wordlists before inserting them; yell at Python 2

2024-12-24 01:41:39 +00:00 · 2013-11-01 19:29:37 -04:00 · 2013-11-01 19:29:37 -04:00 · 823b3828cd
commit 823b3828cd
parent 5c8ba34492
2 changed files with 36 additions and 3 deletions
--- a/tests/test_build.py
+++ b/tests/test_build.py
@ -26,7 +26,7 @@ def test_build():
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
-        load_all_data(config.RAW_DATA_DIR, db_file)
+        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
        conn = sqlite3.connect(db_file)
        # Compare the information we got to the information in the default DB.
@ -43,3 +43,18 @@ def test_build():
            eq_(new_info[i], old_info[i])
    finally:
        shutil.rmtree(tempdir)
 def test_python2():
    """
    Python 2 got to skip two tests up there, because we built a slightly
    wrong wordlist. Now let's test that, in normal operation, it will refuse
    to build this wordlist.

    On Python 2, `load_all_data` called without `do_it_anyway=True` is
    expected to raise UnicodeError. On any other version this test is a no-op.
    """
    if not PYTHON2:
        return

    # Hand load_all_data a real path inside a scratch directory rather than
    # the (fd, path) tuple that tempfile.mkstemp() returns. That way a
    # regression in the Python 2 guard fails with the assertion below, not
    # with an unrelated type error, and nothing is leaked on disk.
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        try:
            load_all_data(config.RAW_DATA_DIR, db_file)
        except UnicodeError:
            # This is the correct case
            pass
        else:
            raise AssertionError("The database should not have been built")
    finally:
        shutil.rmtree(tempdir)
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@ -3,6 +3,7 @@ import sqlite3
 import codecs
 import re
 import os
 import sys
 import logging
 logger = logging.getLogger(__name__)
@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
    """
    rows = [(listname, lang, word, freq)
            for word, freq in freqs.items()]
    conn.execute('DELETE FROM words where wordlist=? and lang=?',
                 (listname, lang))
    conn.executemany(
-        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
+        "INSERT INTO words (wordlist, lang, word, freq) "
        "VALUES (?, ?, ?, ?)",
        rows
    )
@ -135,10 +138,25 @@ def get_db_connection(filename):
 LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
-def load_all_data(source_dir=None, filename=None):
+def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
    """
    Load data from the raw data files into a SQLite database.
    Python 3 has more complete Unicode support than Python 2, and this shows
    up as actual differences in the set of words. For the sake of consistency,
    we say that the data is only valid when built on Python 3.
    Python 2 can still *use* wordfreq, by downloading the database that was
    built on Python 3.
    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
    """
    if sys.version_info.major == 2 and not do_it_anyway:
        raise UnicodeError(
            "Python 2.x has insufficient Unicode support, and will build "
            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
        )
    if source_dir is None:
        source_dir = config.RAW_DATA_DIR