Initial version.

Noticeably missing: data files or any way to get them.
This commit is contained in:
Rob Speer 2013-10-28 19:26:44 -04:00
commit 709ca6be66
8 changed files with 288 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*.pyc
*.swp
build
*.egg-info/
dist
pip-log.txt
.coverage
*~

3
README.txt Normal file
View File

@ -0,0 +1,3 @@
Tools for working with word frequencies from various corpora.
Author: Rob Speer

43
setup.py Executable file
View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
"""Setup script for the wordfreq package."""
import os
from setuptools import setup

version_str = '0.1'

classifiers = [
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: MacOS',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: POSIX',
    'Operating System :: Unix',
    'Programming Language :: C',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development',
    'Topic :: Text Processing :: Linguistic',
]

# The first line of the README becomes the short description; everything
# after the blank line becomes the long description.  Use a context manager
# so the file handle is closed instead of leaked.
readme_path = os.path.join(os.path.dirname(__file__), 'README.txt')
with open(readme_path) as readme_file:
    README_contents = readme_file.read()
doclines = README_contents.split("\n")

setup(
    name="wordfreq",
    version=version_str,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
    classifiers=classifiers,
    long_description="\n".join(doclines[2:]),
    packages=['wordfreq'],
    package_data={'wordfreq': ['data/wordlists/*.txt']},
    install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
)

0
wordfreq/__init__.py Normal file
View File

163
wordfreq/build.py Normal file
View File

@ -0,0 +1,163 @@
from collections import defaultdict
import sqlite3
import codecs
import re
import os
import logging
from ftfy import ftfy
from wordfreq import config, schema
logger = logging.getLogger(__name__)
def read_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line contains a term, a comma, and its frequency.

    Scale the frequencies so they add up to 1.0, and return them as a
    dictionary.
    """
    raw_counts = _read_csv_basic(filename)
    return _scale_freqs(raw_counts)
def read_multilingual_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line is of the form:

        term|lang,freq

    Scale the frequencies so they add up to 1.0 *for each language*,
    and return a dictionary from language -> (word -> freq).
    """
    # Group the raw frequencies by their language tag first.
    per_language = defaultdict(dict)
    raw_freqs = _read_csv_basic(filename)
    for full_term, freq in raw_freqs.items():
        word, language = full_term.rsplit('|', 1)
        per_language[language][word] = freq
    # Then normalize each language's frequencies independently.
    return {language: _scale_freqs(word_freqs)
            for language, word_freqs in per_language.items()}
def _read_csv_basic(filename):
    """
    Read a UTF-8 file of comma-separated (term, count) lines, and return a
    dictionary mapping each term to its count as a float.

    Terms may themselves contain commas; only the last comma on each line
    separates the term from its count.
    """
    counts = {}
    # Use a context manager so the file handle is closed instead of leaked.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip(u'\n')
            if not line:
                # Skip blank lines instead of crashing on the rsplit below.
                continue
            word, count = line.rsplit(u',', 1)
            counts[word] = float(count)
    return counts
NUMBER_RE = re.compile(u'[0-9]+')


def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html

    Each data line has the form "rank freq token"; lines whose first field
    isn't numeric (headers, comments) are skipped.  Tokens are fixed up
    with ftfy, lowercased, and frequencies for tokens that collapse to the
    same form are summed before scaling to a total of 1.0.
    """
    counts = defaultdict(float)
    # Use a context manager so the file handle is closed instead of leaked.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if line:
                rank = line.split(u' ')[0]
                if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                    _, freq, token = line.split(u' ')
                    token = ftfy(token).lower()
                    counts[token] += float(freq)
    return _scale_freqs(counts)
def _scale_freqs(counts):
    """
    Take in unscaled word counts or frequencies, and scale them so that
    they add up to 1.0.

    Return an empty dictionary when `counts` is empty, instead of dividing
    by a zero total.
    """
    if not counts:
        return {}
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}
def save_wordlist_to_db(conn, listname, lang, freqs):
rows = [(listname, lang, word, freq)
for word, freq in freqs.items()]
conn.executemany(
"INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
"VALUES (?, ?, ?, ?)",
rows
)
conn.commit()
def create_db(conn, filename=config.DB_FILENAME):
    """
    Create a wordlist database, at the filename specified by `wordfreq.config`.

    This should be safe to run (and have no effect) if the database already
    exists.
    """
    base_dir = os.path.dirname(filename)
    # A bare filename has no directory component, so dirname returns '';
    # calling os.makedirs('') would raise. Only create the parent directory
    # when there is one and it doesn't exist yet.
    if base_dir and not os.path.exists(base_dir):
        os.makedirs(base_dir)
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
    conn.commit()
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')


def load_all_data(source_dir=config.RAW_DATA_DIR):
    """
    Read every raw wordlist under `source_dir` and store the scaled
    frequencies in the SQLite database named by `wordfreq.config`.
    """
    conn = sqlite3.connect(config.DB_FILENAME)
    logger.info("Creating database")
    create_db(conn)

    # Leeds "Web as Corpus" lists: one file per language code.
    for code in LEEDS_LANGUAGES:
        logger.info("Loading Leeds internet corpus: %s", code)
        leeds_path = os.path.join(
            source_dir, 'leeds', 'internet-%s-forms.num' % code
        )
        save_wordlist_to_db(conn, 'leeds-internet', code,
                            read_leeds_corpus(leeds_path))

    logger.info("Loading Google Books")
    google_path = os.path.join(source_dir, 'google',
                               'google-books-english.csv')
    save_wordlist_to_db(conn, 'google-books', 'en', read_csv(google_path))

    logger.info("Loading combined multilingual corpus")
    multi_path = os.path.join(source_dir, 'luminoso', 'multilingual.csv')
    by_language = read_multilingual_csv(multi_path)
    for code in by_language:
        logger.info("\tLanguage: %s", code)
        save_wordlist_to_db(conn, 'multi', code, by_language[code])

    logger.info("Loading Twitter corpus")
    twitter_path = os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
    save_wordlist_to_db(conn, 'twitter', 'xx', read_csv(twitter_path))

    logger.info("Done loading.")


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_all_data()

8
wordfreq/config.py Normal file
View File

@ -0,0 +1,8 @@
import os

# Directory holding the wordlist database and raw data; can be overridden
# with the WORDFREQ_DATA environment variable.
DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))

# Where raw corpus files are expected to live before being loaded.
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')

VERSION = '0.1'

# The database filename embeds the version, so a new wordfreq version gets
# a fresh database instead of reusing a stale one.
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)

# Maximum number of entries cached by the word_frequency LRU cache.
LRU_SIZE = 100000

50
wordfreq/query.py Normal file
View File

@ -0,0 +1,50 @@
from wordfreq.config import DB_FILENAME, LRU_SIZE
from functools32 import lru_cache
import sqlite3
# Message shown when the wordlist database can't be opened; the configured
# database path is interpolated once, at import time.
SQLITE_ERROR_TEXT = """
Couldn't open the wordlist database.
You may need to run wordfreq's setup.py script.
I was expecting to find the database at:
%(path)s
This can be configured by setting the WORDFREQ_DATA environment variable.
""" % {'path': DB_FILENAME}

# A single module-level connection shared by all queries.
# NOTE(review): sqlite3.connect usually *creates* a missing database file
# rather than raising OperationalError -- confirm this except clause fires
# in the situation the message describes.
try:
    CONN = sqlite3.connect(DB_FILENAME)
except sqlite3.OperationalError:
    raise IOError(SQLITE_ERROR_TEXT)
@lru_cache(maxsize=LRU_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    If the word doesn't appear in the wordlist, return the default value.
    """
    cursor = CONN.cursor()
    cursor.execute(
        "SELECT freq from words where word=? and lang=? and wordlist=?",
        (word, lang, wordlist)
    )
    result = cursor.fetchone()
    return default if result is None else result[0]
# I'm sorry.
METANL_CONSTANT = 50291582140.06433


def metanl_word_frequency(word, lang, default=0.):
    """
    Return a word's frequency in a form that matches the output of
    metanl 0.6.
    """
    # Look up with a None default so "not found" is distinguishable from a
    # genuine frequency of zero.
    found = word_frequency(word, lang, 'multi', None)
    return default if found is None else found * METANL_CONSTANT

13
wordfreq/schema.py Normal file
View File

@ -0,0 +1,13 @@
# Schema for the single `words` table: one row per (wordlist, lang, word)
# holding that word's scaled frequency.
SCHEMA = """
CREATE TABLE IF NOT EXISTS words (
wordlist text,
lang text,
word text,
freq real
)
"""

# A unique index that enforces one frequency per (wordlist, lang, word) --
# and lets INSERT OR REPLACE work -- plus an index for scanning a wordlist's
# entries by descending frequency.
INDICES = [
    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
]