Clear wordlists before inserting them; yell at Python 2

2024-12-23 17:31:41 +00:00 · 2013-11-01 19:29:37 -04:00 · 2013-11-01 19:29:37 -04:00 · 823b3828cd
commit 823b3828cd
parent 5c8ba34492
2 changed files with 36 additions and 3 deletions
--- a/tests/test_build.py
+++ b/tests/test_build.py
@ -26,7 +26,7 @@ def test_build():
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
-        load_all_data(config.RAW_DATA_DIR, db_file)
+        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
        conn = sqlite3.connect(db_file)

        # Compare the information we got to the information in the default DB.
@ -43,3 +43,18 @@ def test_build():
            eq_(new_info[i], old_info[i])
    finally:
        shutil.rmtree(tempdir)
+
+
+def test_python2():
+    """
+    Python 2 got to skip two tests up there, because we built a slightly
+    wrong wordlist. Check that, in normal operation, it refuses to build:
+    load_all_data must raise UnicodeError before it touches the DB path.
+    """
+    if PYTHON2:
+        db_file = os.path.join(tempfile.gettempdir(), 'never_built.db')
+        try:
+            load_all_data(config.RAW_DATA_DIR, db_file)
+            assert False, "The database should not have been built"
+        except UnicodeError:
+            pass  # This is the correct case
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@ -3,6 +3,7 @@ import sqlite3
 import codecs
 import re
 import os
+import sys
 import logging
 logger = logging.getLogger(__name__)

@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
    """
    rows = [(listname, lang, word, freq)
            for word, freq in freqs.items()]
+    conn.execute('DELETE FROM words where wordlist=? and lang=?',
+                 (listname, lang))
    conn.executemany(
-        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
+        "INSERT INTO words (wordlist, lang, word, freq) "
        "VALUES (?, ?, ?, ?)",
        rows
    )
@ -135,10 +138,25 @@ def get_db_connection(filename):


 LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
-def load_all_data(source_dir=None, filename=None):
+def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
    """
    Load data from the raw data files into a SQLite database.
+
+    Python 3 has more complete Unicode support than Python 2, and this shows
+    up as actual differences in the set of words. For the sake of consistency,
+    we say that the data is only valid when built on Python 3.
+
+    Python 2 can still *use* wordfreq, by downloading the database that was
+    built on Python 3.
+
+    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
    """
+    if sys.version_info.major == 2 and not do_it_anyway:
+        raise UnicodeError(
+            "Python 2.x has insufficient Unicode support, and will build "
+            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
+        )
+
    if source_dir is None:
        source_dir = config.RAW_DATA_DIR