Merge branch 'dutch-201504' into ftfy4

Conflicts: setup.py Former-commit-id: 24a7c73e6d
2024-12-24 18:01:38 +00:00 · 2015-05-05 12:04:44 -04:00 · 2015-05-05 12:04:44 -04:00 · d7ea4c420c
commit d7ea4c420c
parent 0cc89b1afa 732c932ac7
9 changed files with 44653 additions and 47 deletions
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 from collections import defaultdict
 import sqlite3
 import codecs
@ -47,14 +48,15 @@ def _read_csv_basic(filename):

    counts = {}
    for line in infile:
-        line = line.rstrip(u'\n')
-        word, count = line.rsplit(u',', 1)
+        if ',' in line:
+            line = line.rstrip('\n')
+            word, count = line.rsplit(',', 1)
            count = float(count)
            counts[standardize_word(word)] = count
    return counts


-NUMBER_RE = re.compile(u'[0-9]+')
+NUMBER_RE = re.compile('[0-9]+')
 def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
@ -68,9 +70,9 @@ def read_leeds_corpus(filename):
    for line in infile:
        line = line.rstrip()
        if line:
-            rank = line.split(u' ')[0]
-            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
-                _, freq, token = line.split(u' ')
+            rank = line.split(' ')[0]
+            if NUMBER_RE.match(rank) and line.count(' ') == 2:
+                _, freq, token = line.split(' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq
@ -131,6 +133,24 @@ def get_db_connection(filename):
    return sqlite3.connect(filename)


+def read_leeds_wordlist_into_db(conn, filename, dbname, lang):
+    """
+    Read a Leeds "Web as Corpus" frequency file and save it to the database.
+
+    The file at `filename` is parsed with `read_leeds_corpus`, and the
+    resulting wordlist is stored through `save_wordlist_to_db` on the
+    connection `conn`, under the wordlist name `dbname` and the language
+    code `lang`.
+    """
+    # Pass the filename as a logging argument instead of pre-formatting it,
+    # matching `read_wordlist_into_db` and deferring formatting to the logger.
+    logger.info("Loading %r", filename)
+    wordlist = read_leeds_corpus(filename)
+    save_wordlist_to_db(conn, dbname, lang, wordlist)
+
+
+def read_wordlist_into_db(conn, filename, dbname, lang='*'):
+    """
+    Read a CSV wordlist from `filename` and save it to the database as
+    the wordlist named `dbname`.
+
+    When `lang` is '*', the file is read with `read_multilingual_csv` and
+    one wordlist is saved for each language found in it. For any other
+    value, the file is read with `read_csv` and saved under the single
+    language code `lang`.
+    """
+    logger.info("Loading %r", filename)
+
+    # Single-language file: load it and store it under the given language.
+    if lang != '*':
+        save_wordlist_to_db(conn, dbname, lang, read_csv(filename))
+        return
+
+    # Multilingual file: store each language's wordlist separately.
+    wordlists_by_language = read_multilingual_csv(filename)
+    for language in wordlists_by_language:
+        logger.info("\tLanguage: %s", language)
+        save_wordlist_to_db(
+            conn, dbname, language, wordlists_by_language[language]
+        )
+
+
 LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
 def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
    """
@ -157,53 +177,29 @@ def load_all_data(source_dir=None, filename=None, do_it_anyway=False):
    if filename is None:
        filename = config.DB_FILENAME

+    def wordlist_path(*pieces):
+        return os.path.join(source_dir, *pieces)
+
    logger.info("Creating database")
    conn = create_db(filename)

-    logger.info("Loading Leeds internet corpus:")
    for lang in LEEDS_LANGUAGES:
-        logger.info("\tLanguage: %s" % lang)
-        filename = os.path.join(
-            source_dir, 'leeds', 'internet-%s-forms.num' % lang
-        )
-        wordlist = read_leeds_corpus(filename)
-        save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)
+        filename = wordlist_path('leeds', 'internet-%s-forms.num' % lang)
+        read_leeds_wordlist_into_db(conn, filename, 'leeds-internet', lang)

-    logger.info("Loading Google Books (English).")
-    google_wordlist = read_csv(
-        os.path.join(source_dir, 'google', 'google-books-english.csv')
-    )
-    save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)
+    read_wordlist_into_db(conn, wordlist_path('google', 'google-books-english.csv'), 'google-books', 'en')
+    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-52M.csv'), 'twitter', 'xx')
+    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-stems-2014.csv'), 'twitter-stems', '*')
+    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-surfaces-2014.csv'), 'twitter-surfaces', '*')

    logger.info("Loading combined multilingual corpus:")
-    multi_wordlist = read_multilingual_csv(
-        os.path.join(source_dir, 'luminoso', 'multilingual.csv')
-    )
+    multi_wordlist = read_multilingual_csv(wordlist_path('luminoso', 'multilingual.csv'))
    for lang in multi_wordlist:
        logger.info("\tLanguage: %s" % lang)
        save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])

-    logger.info("Loading Twitter corpus.")
-    twitter_wordlist = read_csv(
-        os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
-    )
-    save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
-
-    logger.info("Loading stemmed Twitter corpus.")
-    twitter_stems_wordlist = read_multilingual_csv(
-        os.path.join(source_dir, 'luminoso', 'twitter-stems-2014.csv')
-    )
-    for lang in twitter_stems_wordlist:
-        logger.info("\tLanguage: %s" % lang)
-        save_wordlist_to_db(conn, 'twitter-stems', lang, twitter_stems_wordlist[lang])
-
-    logger.info("Loading unstemmed Twitter corpus.")
-    twitter_surface_wordlist = read_multilingual_csv(
-        os.path.join(source_dir, 'luminoso', 'twitter-surfaces-2014.csv')
-    )
-    for lang in twitter_surface_wordlist:
-        logger.info("\tLanguage: %s" % lang)
-        save_wordlist_to_db(conn, 'twitter-surfaces', lang, twitter_surface_wordlist[lang])
+    # Load Dutch from a separate source. We may end up with more languages like this.
+    read_wordlist_into_db(conn, wordlist_path('luminoso', 'nl-combined-201504.csv'), 'surfaces', '*')

    logger.info("Done loading.")

--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))

 # When the minor version number increments, the data may change.
-VERSION = '0.5.0'
+VERSION = '0.7.0'
 MINOR_VERSION = '.'.join(VERSION.split('.')[:2])

 # Put these options together to make a database filename.
--- a/wordfreq_data/luminoso/nl-combined-201503.csv.REMOVED.git-id
+++ b/wordfreq_data/luminoso/nl-combined-201503.csv.REMOVED.git-id
@ -0,0 +1 @@
+9b29de132c82bd7287c08c2937e3c4821525e356
--- a/wordfreq_data/luminoso/nl-combined-201504.csv.REMOVED.git-id
+++ b/wordfreq_data/luminoso/nl-combined-201504.csv.REMOVED.git-id
@ -0,0 +1 @@
+956c3ff57edf5c45f3e850efd87a30d25c1b4bee
--- a/wordfreq_data/luminoso/twitter-stems-2014-nl.csv
+++ b/wordfreq_data/luminoso/twitter-stems-2014-nl.csv
--- a/wordfreq_data/luminoso/twitter-surfaces-2014-nl.csv
+++ b/wordfreq_data/luminoso/twitter-surfaces-2014-nl.csv
--- a/wordfreq_data/luminoso/twitter-surfaces-2014.csv.REMOVED.git-id
+++ b/wordfreq_data/luminoso/twitter-surfaces-2014.csv.REMOVED.git-id
@ -1 +1 @@
-8ba8230ca42d8e9e622afee772b3a96c34126e23
+1e9d162c0c1333ce4a9afd79cd8686805f1e19c3
--- a/wordfreq_data/wikipedia/stems-nl.csv.REMOVED.git-id
+++ b/wordfreq_data/wikipedia/stems-nl.csv.REMOVED.git-id
@ -0,0 +1 @@
+b9d52d81bbe078a7de17519ed3494eb4771f0f69
--- a/wordfreq_data/wikipedia/surfaces-nl.csv.REMOVED.git-id
+++ b/wordfreq_data/wikipedia/surfaces-nl.csv.REMOVED.git-id
@ -0,0 +1 @@
+f69e13f6be1183f69166fe287ada38354ce4de99