mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
Clear wordlists before inserting them; yell at Python 2
This commit is contained in:
parent
5c8ba34492
commit
823b3828cd
@ -26,7 +26,7 @@ def test_build():
|
|||||||
tempdir = tempfile.mkdtemp('.wordfreq')
|
tempdir = tempfile.mkdtemp('.wordfreq')
|
||||||
try:
|
try:
|
||||||
db_file = os.path.join(tempdir, 'test.db')
|
db_file = os.path.join(tempdir, 'test.db')
|
||||||
load_all_data(config.RAW_DATA_DIR, db_file)
|
load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
|
||||||
conn = sqlite3.connect(db_file)
|
conn = sqlite3.connect(db_file)
|
||||||
|
|
||||||
# Compare the information we got to the information in the default DB.
|
# Compare the information we got to the information in the default DB.
|
||||||
@ -43,3 +43,18 @@ def test_build():
|
|||||||
eq_(new_info[i], old_info[i])
|
eq_(new_info[i], old_info[i])
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(tempdir)
|
shutil.rmtree(tempdir)
|
||||||
|
|
||||||
|
|
||||||
|
def test_python2():
    """
    Check that building the database is refused on Python 2 by default.

    Python 2 got to skip two tests up there, because we built a slightly
    wrong wordlist. Now let's test that, in normal operation, it will refuse
    to build this wordlist.

    On Python 3 this test does nothing.
    """
    if not PYTHON2:
        return

    # Use a throwaway directory, as test_build does, so that a real path
    # string is passed as the database filename and nothing is left behind.
    # (`tempfile.mkstemp()` returns an `(fd, path)` tuple and leaves an open
    # descriptor and a file on disk.)
    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        try:
            load_all_data(config.RAW_DATA_DIR, db_file)
        except UnicodeError:
            # This is the correct case
            return
        # Raise explicitly rather than `assert False`, so that the check
        # is not stripped when Python runs with -O.
        raise AssertionError("The database should not have been built")
    finally:
        shutil.rmtree(tempdir)
|
||||||
|
@ -3,6 +3,7 @@ import sqlite3
|
|||||||
import codecs
|
import codecs
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -104,8 +105,10 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
|
|||||||
"""
|
"""
|
||||||
rows = [(listname, lang, word, freq)
|
rows = [(listname, lang, word, freq)
|
||||||
for word, freq in freqs.items()]
|
for word, freq in freqs.items()]
|
||||||
|
conn.execute('DELETE FROM words where wordlist=? and lang=?',
|
||||||
|
(listname, lang))
|
||||||
conn.executemany(
|
conn.executemany(
|
||||||
"INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
|
"INSERT INTO words (wordlist, lang, word, freq) "
|
||||||
"VALUES (?, ?, ?, ?)",
|
"VALUES (?, ?, ?, ?)",
|
||||||
rows
|
rows
|
||||||
)
|
)
|
||||||
@ -135,10 +138,25 @@ def get_db_connection(filename):
|
|||||||
|
|
||||||
|
|
||||||
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
|
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
|
||||||
def load_all_data(source_dir=None, filename=None):
|
def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
|
||||||
"""
|
"""
|
||||||
Load data from the raw data files into a SQLite database.
|
Load data from the raw data files into a SQLite database.
|
||||||
|
|
||||||
|
Python 3 has more complete Unicode support than Python 2, and this shows
|
||||||
|
up as actual differences in the set of words. For the sake of consistency,
|
||||||
|
we say that the data is only valid when built on Python 3.
|
||||||
|
|
||||||
|
Python 2 can still *use* wordfreq, by downloading the database that was
|
||||||
|
built on Python 3.
|
||||||
|
|
||||||
|
If you insist on building the Python 2 version, pass `do_it_anyway=True`.
|
||||||
"""
|
"""
|
||||||
|
if sys.version_info.major == 2 and not do_it_anyway:
|
||||||
|
raise UnicodeError(
|
||||||
|
"Python 2.x has insufficient Unicode support, and will build "
|
||||||
|
"the wrong database. Pass `do_it_anyway=True` to do it anyway."
|
||||||
|
)
|
||||||
|
|
||||||
if source_dir is None:
|
if source_dir is None:
|
||||||
source_dir = config.RAW_DATA_DIR
|
source_dir = config.RAW_DATA_DIR
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user