diff --git a/tests/test_build.py b/tests/test_build.py
index 31c9c04..56b0977 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -26,7 +26,7 @@ def test_build():
     tempdir = tempfile.mkdtemp('.wordfreq')
     try:
         db_file = os.path.join(tempdir, 'test.db')
-        load_all_data(config.RAW_DATA_DIR, db_file)
+        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
         conn = sqlite3.connect(db_file)
 
         # Compare the information we got to the information in the default DB.
@@ -43,3 +43,18 @@ def test_build():
             eq_(new_info[i], old_info[i])
     finally:
         shutil.rmtree(tempdir)
+
+
+def test_python2():
+    """
+    Python 2 must refuse (with UnicodeError) to build the wordlist by default.
+    """
+    if PYTHON2:
+        tempdir = tempfile.mkdtemp('.wordfreq')  # not mkstemp(): (fd, path)
+        try:
+            load_all_data(config.RAW_DATA_DIR, os.path.join(tempdir, 'py2.db'))
+            assert False, "The database should not have been built"
+        except UnicodeError:
+            pass  # This is the correct case
+        finally:
+            shutil.rmtree(tempdir)
diff --git a/wordfreq/build.py b/wordfreq/build.py
index d5f1609..486b559 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -3,6 +3,7 @@ import sqlite3
 import codecs
 import re
 import os
+import sys
 import logging
 logger = logging.getLogger(__name__)
 
@@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     """
     rows = [(listname, lang, word, freq)
             for word, freq in freqs.items()]
+    conn.execute('DELETE FROM words where wordlist=? and lang=?',
+                 (listname, lang))
     conn.executemany(
-        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
+        "INSERT INTO words (wordlist, lang, word, freq) "
         "VALUES (?, ?, ?, ?)",
         rows
     )
@@ -135,10 +138,25 @@ def get_db_connection(filename):
 
 
 LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
-def load_all_data(source_dir=None, filename=None):
+def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
     """
     Load data from the raw data files into a SQLite database.
+
+    Python 3 has more complete Unicode support than Python 2, and this shows
+    up as actual differences in the set of words. For the sake of consistency,
+    we say that the data is only valid when built on Python 3.
+
+    Python 2 can still *use* wordfreq, by downloading the database that was
+    built on Python 3.
+
+    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
     """
+    if sys.version_info.major == 2 and not do_it_anyway:
+        raise UnicodeError(
+            "Python 2.x has insufficient Unicode support, and will build "
+            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
+        )
+
     if source_dir is None:
         source_dir = config.RAW_DATA_DIR