v0.7: make a proper Dutch 'surfaces' list

2024-12-24 01:41:39 +00:00 · 2015-04-30 13:01:24 -04:00 · 2015-04-30 13:01:24 -04:00 · 873ace87db
commit 873ace87db
parent 6cf46ee5aa
4 changed files with 1295660 additions and 12 deletions
--- a/setup.py
+++ b/setup.py
@@ -95,7 +95,7 @@ class CustomDevelopCommand(develop):
            self.run_command('download_db')
-requirements = ['ftfy >= 3']
+requirements = ['ftfy >= 3, < 4']
 if sys.version_info.major == 2:
    requirements.append('functools32')
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 from collections import defaultdict
 import sqlite3
 import codecs
@@ -47,14 +48,15 @@ def _read_csv_basic(filename):
    counts = {}
    for line in infile:
-        line = line.rstrip(u'\n')
+        if ',' in line:
-        word, count = line.rsplit(u',', 1)
+            line = line.rstrip('\n')
-        count = float(count)
+            word, count = line.rsplit(',', 1)
-        counts[standardize_word(word)] = count
+            count = float(count)
+            counts[standardize_word(word)] = count
    return counts
-NUMBER_RE = re.compile(u'[0-9]+')
+NUMBER_RE = re.compile('[0-9]+')
 def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
@@ -68,9 +70,9 @@ def read_leeds_corpus(filename):
    for line in infile:
        line = line.rstrip()
        if line:
-            rank = line.split(u' ')[0]
+            rank = line.split(' ')[0]
-            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
+            if NUMBER_RE.match(rank) and line.count(' ') == 2:
-                _, freq, token = line.split(u' ')
+                _, freq, token = line.split(' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq
@@ -119,7 +121,7 @@ def create_db(filename):
        os.makedirs(base_dir)
    conn = get_db_connection(filename)
-    
+
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
@@ -197,7 +199,7 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
        save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
    # Load Dutch from a separate source. We may end up with more languages like this.
-    read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201503.csv'), 'stems', '*')
+    read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201504.csv'), 'surfaces', '*')
    logger.info("Done loading.")
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))
 # When the minor version number increments, the data may change.
-VERSION = '0.6.0'
+VERSION = '0.7.0'
 MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
 # Put these options together to make a database filename.
--- a/wordfreq_data/luminoso/nl-combined-201504.csv
+++ b/wordfreq_data/luminoso/nl-combined-201504.csv