mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Initial version.
Noticeably missing: data files or any way to get them.
This commit is contained in:
commit
e8273e47a1
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
*.pyc
|
||||||
|
*.swp
|
||||||
|
build
|
||||||
|
*.egg-info/
|
||||||
|
dist
|
||||||
|
pip-log.txt
|
||||||
|
.coverage
|
||||||
|
*~
|
3
README.txt
Normal file
3
README.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
Tools for working with word frequencies from various corpora.
|
||||||
|
|
||||||
|
Author: Robyn Speer
|
43
setup.py
Executable file
43
setup.py
Executable file
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python
"""Packaging script for the wordfreq library."""
import os
import sys

from setuptools import setup

version_str = '0.1'

classifiers = [
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: MacOS',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: POSIX',
    'Operating System :: Unix',
    'Programming Language :: C',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development',
    'Topic :: Text Processing :: Linguistic',
]

# The README's first line becomes the short description; everything after the
# blank line becomes the long description.  Use a with-block so the file
# handle is closed promptly instead of leaking until garbage collection.
with open(os.path.join(os.path.dirname(__file__), 'README.txt')) as readme_file:
    README_contents = readme_file.read()
doclines = README_contents.split("\n")

# functools32 is a Python 2 backport of functools.lru_cache.  Installing it
# on Python 3 fails, so only require it on Python 2 -- the classifiers above
# advertise Python 3 support, and the unconditional requirement broke that.
install_requires = ['ftfy >= 3']
if sys.version_info[0] == 2:
    install_requires.append('functools32 == 3.2.3-1')

setup(
    name="wordfreq",
    version=version_str,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
    classifiers=classifiers,
    long_description="\n".join(doclines[2:]),
    packages=['wordfreq'],
    package_data={'wordfreq': ['data/wordlists/*.txt']},
    install_requires=install_requires,
)
|
0
wordfreq/__init__.py
Normal file
0
wordfreq/__init__.py
Normal file
163
wordfreq/build.py
Normal file
163
wordfreq/build.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
from collections import defaultdict
import sqlite3
import codecs
import re
import os
import logging

from ftfy import ftfy
from wordfreq import config, schema

# Module-level logger, following the standard getLogger(__name__) convention;
# handlers/levels are configured by the caller (see the __main__ block).
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv(filename):
    """
    Load word frequencies from a comma-separated file in which each line
    holds a term, a comma, and the term's frequency.

    The raw values are normalized so they sum to 1.0, and the result is
    returned as a dictionary mapping each term to its scaled frequency.
    """
    raw_counts = _read_csv_basic(filename)
    return _scale_freqs(raw_counts)
|
||||||
|
|
||||||
|
|
||||||
|
def read_multilingual_csv(filename):
    """
    Load word frequencies from a comma-separated file whose lines have the
    form:

        term|lang,freq

    The frequencies are normalized to sum to 1.0 *within each language*.
    Returns a dictionary mapping each language code to its own
    (word -> frequency) dictionary.
    """
    per_language = defaultdict(dict)
    for termlang, freq in _read_csv_basic(filename).items():
        # The language tag follows the final '|', so split from the right in
        # case the term itself contains a '|'.
        term, lang = termlang.rsplit('|', 1)
        per_language[lang][term] = freq

    return {lang: _scale_freqs(words)
            for lang, words in per_language.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def _read_csv_basic(filename):
|
||||||
|
infile = codecs.open(filename, encoding='utf-8')
|
||||||
|
|
||||||
|
counts = {}
|
||||||
|
for line in infile:
|
||||||
|
line = line.rstrip(u'\n')
|
||||||
|
word, count = line.rsplit(u',', 1)
|
||||||
|
count = float(count)
|
||||||
|
counts[word] = count
|
||||||
|
return counts
|
||||||
|
|
||||||
|
|
||||||
|
NUMBER_RE = re.compile(u'[0-9]+')


def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html

    Returns a dictionary of frequencies scaled to sum to 1.0.
    """
    counts = defaultdict(float)
    # Use a with-block so the file handle is released even if a line fails
    # to parse (the original never closed the file).
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue
            rank = line.split(u' ')[0]
            # Data lines look like "<rank> <freq> <token>"; lines that don't
            # start with a number (headers, comments) are skipped.
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                # ftfy repairs mojibake before the token is case-folded, so
                # variants of the same word accumulate into one entry.
                token = ftfy(token).lower()
                counts[token] += float(freq)

    return _scale_freqs(counts)
|
||||||
|
|
||||||
|
|
||||||
|
def _scale_freqs(counts):
|
||||||
|
"""
|
||||||
|
Take in unscaled word counts or frequencies, and scale them so that
|
||||||
|
they add up to 1.0.
|
||||||
|
"""
|
||||||
|
freqs = {}
|
||||||
|
total = sum(counts.values())
|
||||||
|
for word in counts:
|
||||||
|
freqs[word] = counts[word] / total
|
||||||
|
|
||||||
|
return freqs
|
||||||
|
|
||||||
|
|
||||||
|
def save_wordlist_to_db(conn, listname, lang, freqs):
    """
    Write every (word, frequency) pair in `freqs` to the `words` table of
    the given database connection, tagged with the wordlist name and
    language code, then commit.

    Existing rows for the same (wordlist, lang, word) are replaced.
    """
    insert_sql = (
        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
        "VALUES (?, ?, ?, ?)"
    )
    conn.executemany(
        insert_sql,
        ((listname, lang, word, freq) for word, freq in freqs.items()),
    )
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def create_db(conn, filename=config.DB_FILENAME):
    """
    Create the wordlist database schema on the given connection, making
    sure the directory for `filename` (by default, the path configured in
    `wordfreq.config`) exists first.

    Safe to run repeatedly: the schema and indices use IF NOT EXISTS.
    """
    db_directory = os.path.dirname(filename)
    if not os.path.exists(db_directory):
        os.makedirs(db_directory)

    conn.execute(schema.SCHEMA)
    for index_sql in schema.INDICES:
        conn.execute(index_sql)
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')


def load_all_data(source_dir=config.RAW_DATA_DIR):
    """
    Build the wordlist database from every known source corpus found under
    `source_dir`: the Leeds internet corpora, Google Books, the combined
    multilingual corpus, and the Twitter corpus.

    Creates the database (if needed) at the location configured in
    `wordfreq.config`, and closes the connection when done.
    """
    conn = sqlite3.connect(config.DB_FILENAME)
    try:
        logger.info("Creating database")
        create_db(conn)

        for lang in LEEDS_LANGUAGES:
            # Lazy %-style logger args: formatting is skipped entirely when
            # the message is filtered out by the log level.
            logger.info("Loading Leeds internet corpus: %s", lang)
            filename = os.path.join(
                source_dir, 'leeds', 'internet-%s-forms.num' % lang
            )
            wordlist = read_leeds_corpus(filename)
            save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)

        logger.info("Loading Google Books")
        google_wordlist = read_csv(
            os.path.join(source_dir, 'google', 'google-books-english.csv')
        )
        save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)

        logger.info("Loading combined multilingual corpus")
        multi_wordlist = read_multilingual_csv(
            os.path.join(source_dir, 'luminoso', 'multilingual.csv')
        )
        for lang in multi_wordlist:
            logger.info("\tLanguage: %s", lang)
            save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])

        logger.info("Loading Twitter corpus")
        twitter_wordlist = read_csv(
            os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
        )
        save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)

        logger.info("Done loading.")
    finally:
        # Release the connection even if one of the corpora fails to load
        # (the original left it open).
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Running this module as a script builds the entire wordlist database.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_all_data()
|
8
wordfreq/config.py
Normal file
8
wordfreq/config.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import os

# Directory holding the wordlist database.  Overridable through the
# WORDFREQ_DATA environment variable; defaults to ~/.cache/wordfreq.
DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))
# Where the raw source corpora are expected to live before building the
# database (see wordfreq.build.load_all_data).
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
VERSION = '0.1'
# The version is embedded in the filename, so incompatible database formats
# from different wordfreq versions can coexist.
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
# Maximum number of entries kept by the word_frequency LRU cache
# (see wordfreq.query).
LRU_SIZE = 100000
|
50
wordfreq/query.py
Normal file
50
wordfreq/query.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
from wordfreq.config import DB_FILENAME, LRU_SIZE
# NOTE(review): functools32 is the Python 2 backport of functools.lru_cache;
# this import will fail on Python 3, where the cache lives in functools.
from functools32 import lru_cache
import sqlite3

# Shown when the database can't be opened -- most likely because the build
# step (wordfreq.build) has not been run yet.
SQLITE_ERROR_TEXT = """
Couldn't open the wordlist database.
You may need to run wordfreq's setup.py script.

I was expecting to find the database at:

    %(path)s

This can be configured by setting the WORDFREQ_DATA environment variable.
""" % {'path': DB_FILENAME}

# One module-level connection is opened at import time and shared by every
# query in this module.
try:
    CONN = sqlite3.connect(DB_FILENAME)
except sqlite3.OperationalError:
    raise IOError(SQLITE_ERROR_TEXT)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=LRU_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
    """
    Look up the frequency of `word` for the language code `lang` in the
    named `wordlist`.

    Returns `default` when the word is not present.  Results are memoized
    in an LRU cache of up to LRU_SIZE entries.
    """
    cursor = CONN.cursor()
    cursor.execute(
        "SELECT freq from words where word=? and lang=? and wordlist=?",
        (word, lang, wordlist)
    )
    row = cursor.fetchone()
    return default if row is None else row[0]
|
||||||
|
|
||||||
|
# I'm sorry.  (Magic scaling factor that reproduces metanl 0.6's output.)
METANL_CONSTANT = 50291582140.06433


def metanl_word_frequency(word, lang, default=0.):
    """
    Return a word's frequency scaled so it matches the output of
    metanl 0.6.

    Falls back to `default` when the word is absent from the 'multi'
    wordlist.
    """
    freq = word_frequency(word, lang, 'multi', None)
    if freq is None:
        return default
    return freq * METANL_CONSTANT
|
13
wordfreq/schema.py
Normal file
13
wordfreq/schema.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# One row per (wordlist, lang, word); `freq` is the word's scaled frequency.
SCHEMA = """
CREATE TABLE IF NOT EXISTS words (
    wordlist text,
    lang text,
    word text,
    freq real
)
"""

# The unique index makes INSERT OR REPLACE upsert on (wordlist, lang, word);
# the second index supports listing a wordlist's words by descending
# frequency.
INDICES = [
    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
]
|
Loading…
Reference in New Issue
Block a user