commit 709ca6be660a04282dcc02fc005d13842c9d256a
Author: Robyn Speer
Date:   Mon Oct 28 19:26:44 2013 -0400

    Initial version. Noticeably missing: data files or any way to get them.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..53d257f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+*.swp
+build
+*.egg-info/
+dist
+pip-log.txt
+.coverage
+*~
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..5ab3d58
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,3 @@
+Tools for working with word frequencies from various corpora.
+
+Author: Rob Speer
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..8c85548
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+from setuptools import setup
+
+version_str = '0.1'
+
+classifiers = [
+    'Intended Audience :: Developers',
+    'Intended Audience :: Science/Research',
+    'License :: OSI Approved :: MIT License',
+    'Natural Language :: English',
+    'Operating System :: MacOS',
+    'Operating System :: Microsoft :: Windows',
+    'Operating System :: POSIX',
+    'Operating System :: Unix',
+    'Programming Language :: C',
+    'Programming Language :: Python :: 2',
+    'Programming Language :: Python :: 2.7',
+    'Programming Language :: Python :: 3',
+    'Programming Language :: Python :: 3.3',
+    'Topic :: Scientific/Engineering',
+    'Topic :: Software Development',
+    'Topic :: Text Processing :: Linguistic',
+]
+
+# The first line of the README is the short description; everything from
+# its third line on is the long description.
+readme_path = os.path.join(os.path.dirname(__file__), 'README.txt')
+with open(readme_path) as readme_file:
+    README_contents = readme_file.read()
+doclines = README_contents.split("\n")
+
+install_requires = ['ftfy >= 3']
+if sys.version_info[0] < 3:
+    # functools32 is a backport for Python 2 only; requiring it
+    # unconditionally makes installation fail on Python 3.
+    install_requires.append('functools32 == 3.2.3-1')
+
+setup(
+    name="wordfreq",
+    version=version_str,
+    maintainer='Luminoso Technologies, Inc.',
+    maintainer_email='dev@luminoso.com',
+    url='http://github.com/LuminosoInsight/wordfreq/',
+    license="MIT",
+    platforms=["any"],
+    description=doclines[0],
+    classifiers=classifiers,
+    long_description="\n".join(doclines[2:]),
+    packages=['wordfreq'],
+    package_data={'wordfreq': ['data/wordlists/*.txt']},
+    install_requires=install_requires,
+)
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wordfreq/build.py b/wordfreq/build.py
new file mode 100644
index 0000000..08902d3
--- /dev/null
+++ b/wordfreq/build.py
@@ -0,0 +1,206 @@
+from collections import defaultdict
+import sqlite3
+import codecs
+import re
+import os
+import logging
+
+from ftfy import ftfy
+from wordfreq import config, schema
+
+logger = logging.getLogger(__name__)
+
+
+def read_csv(filename):
+    """
+    Load word frequencies from a file of comma-separated values, where
+    each line contains a term, a comma, and its frequency.
+
+    Scale the frequencies so they add up to 1.0, and return them as a
+    dictionary.
+    """
+    return _scale_freqs(_read_csv_basic(filename))
+
+
+def read_multilingual_csv(filename):
+    """
+    Load word frequencies from a file of comma-separated values, where
+    each line is of the form:
+
+        term|lang,freq
+
+    Scale the frequencies so they add up to 1.0 *for each language*,
+    and return a dictionary from language -> (word -> freq).
+    """
+    unscaled = defaultdict(dict)
+    for termlang, freq in _read_csv_basic(filename).items():
+        # Split on the last '|' only, so a term may itself contain '|'.
+        term, lang = termlang.rsplit('|', 1)
+        unscaled[lang][term] = freq
+
+    return dict(
+        (lang, _scale_freqs(freqs)) for lang, freqs in unscaled.items()
+    )
+
+
+def _read_csv_basic(filename):
+    """
+    Read a UTF-8 file of `term,count` lines into a dictionary that maps
+    each term to its count as a float, without rescaling.
+
+    Each line is split at its last comma, so a term may contain commas.
+    If a term appears more than once, its last count wins. Blank lines
+    are skipped.
+    """
+    counts = {}
+    # The `with` block closes the file even if a line fails to parse.
+    with codecs.open(filename, encoding='utf-8') as infile:
+        for line in infile:
+            line = line.rstrip(u'\n')
+            if not line.strip():
+                continue
+            word, count = line.rsplit(u',', 1)
+            counts[word] = float(count)
+    return counts
+
+
+NUMBER_RE = re.compile(u'[0-9]+')
+
+
+def read_leeds_corpus(filename):
+    """
+    Load word frequencies from a "Web as Corpus" file, collected and
+    provided by the University of Leeds.
+
+    For more information, see: http://corpus.leeds.ac.uk/list.html
+
+    Only lines made of exactly three space-separated fields (rank,
+    frequency, token) whose first field starts with a digit are used;
+    every other line is skipped. Tokens are passed through ftfy and
+    lowercased, and tokens that become identical have their frequencies
+    added together before everything is scaled to add up to 1.0.
+    """
+    counts = defaultdict(float)
+    # The `with` block closes the file even if a line fails to parse.
+    with codecs.open(filename, encoding='utf-8') as infile:
+        for line in infile:
+            fields = line.rstrip().split(u' ')
+            if len(fields) != 3 or not NUMBER_RE.match(fields[0]):
+                continue
+            _, freq, token = fields
+            counts[ftfy(token).lower()] += float(freq)
+
+    return _scale_freqs(counts)
+
+
+def _scale_freqs(counts):
+    """
+    Take in unscaled word counts or frequencies, and scale them so that
+    they add up to 1.0.
+
+    Returns a new dictionary and leaves `counts` unmodified. Raises
+    ZeroDivisionError if `counts` is non-empty but its values sum to zero.
+    """
+    # Convert the total to a float so that integer counts are not
+    # floor-divided on Python 2.
+    total = float(sum(counts.values()))
+    return dict((word, count / total) for word, count in counts.items())
+
+
+def save_wordlist_to_db(conn, listname, lang, freqs):
+    """
+    Store the word -> frequency dictionary `freqs` in the `words` table
+    under the wordlist name `listname` and language code `lang`, replacing
+    any rows already stored for the same words, and commit.
+    """
+    rows = [(listname, lang, word, freq)
+            for word, freq in freqs.items()]
+    conn.executemany(
+        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
+        "VALUES (?, ?, ?, ?)",
+        rows
+    )
+    conn.commit()
+
+
+def _ensure_parent_dir(filename):
+    """
+    Create the directory that will contain `filename`, if it is missing.
+    """
+    base_dir = os.path.dirname(filename)
+    # dirname is '' for a bare filename, and os.makedirs('') would fail.
+    if base_dir and not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+
+
+def create_db(conn, filename=config.DB_FILENAME):
+    """
+    Create the wordlist table and its indices on the connection `conn`.
+
+    `filename` is only used to make sure that its directory exists; it
+    defaults to the database path specified by `wordfreq.config`.
+
+    This should be safe to run (and have no effect) if the database already
+    exists.
+    """
+    _ensure_parent_dir(filename)
+
+    conn.execute(schema.SCHEMA)
+    for index_definition in schema.INDICES:
+        conn.execute(index_definition)
+    conn.commit()
+
+
+LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
+
+
+def load_all_data(source_dir=config.RAW_DATA_DIR):
+    """
+    Build the wordlist database at `config.DB_FILENAME` from the raw data
+    files under `source_dir`.
+    """
+    # sqlite3.connect cannot create missing directories, so the database
+    # directory has to exist before connecting.
+    _ensure_parent_dir(config.DB_FILENAME)
+    conn = sqlite3.connect(config.DB_FILENAME)
+    try:
+        logger.info("Creating database")
+        create_db(conn)
+
+        for lang in LEEDS_LANGUAGES:
+            logger.info("Loading Leeds internet corpus: %s", lang)
+            filename = os.path.join(
+                source_dir, 'leeds', 'internet-%s-forms.num' % lang
+            )
+            wordlist = read_leeds_corpus(filename)
+            save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)
+
+        logger.info("Loading Google Books")
+        google_wordlist = read_csv(
+            os.path.join(source_dir, 'google', 'google-books-english.csv')
+        )
+        save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)
+
+        logger.info("Loading combined multilingual corpus")
+        multi_wordlist = read_multilingual_csv(
+            os.path.join(source_dir, 'luminoso', 'multilingual.csv')
+        )
+        for lang in multi_wordlist:
+            logger.info("\tLanguage: %s", lang)
+            save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])
+
+        logger.info("Loading Twitter corpus")
+        twitter_wordlist = read_csv(
+            os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
+        )
+        save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)
+
+        logger.info("Done loading.")
+    finally:
+        # Release the connection even if loading fails part-way.
+        conn.close()
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    load_all_data()
diff --git a/wordfreq/config.py b/wordfreq/config.py
new file mode 100644
index 0000000..b949c47
--- /dev/null
+++ b/wordfreq/config.py
@@ -0,0 +1,12 @@
+"""Locations and settings shared by the wordfreq modules."""
+import os
+
+# Directory holding the database; the WORDFREQ_DATA environment variable
+# overrides the default under ~/.cache.
+DB_DIR = (os.environ.get('WORDFREQ_DATA')
+          or os.path.expanduser('~/.cache/wordfreq'))
+RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
+VERSION = '0.1'
+DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
+# Maximum number of entries kept by the word-frequency lookup cache.
+LRU_SIZE = 100000
diff --git a/wordfreq/query.py b/wordfreq/query.py
new file mode 100644
index 0000000..fad80dc
--- /dev/null
+++ b/wordfreq/query.py
@@ -0,0 +1,73 @@
+from wordfreq.config import DB_FILENAME, LRU_SIZE
+try:
+    # lru_cache is part of the standard library from Python 3.2 on.
+    from functools import lru_cache
+except ImportError:
+    # Older Pythons need the functools32 backport.
+    from functools32 import lru_cache
+import sqlite3
+
+SQLITE_ERROR_TEXT = """
+Couldn't open the wordlist database.
+You may need to run wordfreq's setup.py script.
+
+I was expecting to find the database at:
+
+    %(path)s
+
+This can be configured by setting the WORDFREQ_DATA environment variable.
+""" % {'path': DB_FILENAME}
+
+# The database is opened once, when this module is imported, and the
+# connection is shared by every lookup.
+try:
+    CONN = sqlite3.connect(DB_FILENAME)
+except sqlite3.OperationalError:
+    raise IOError(SQLITE_ERROR_TEXT)
+
+
+@lru_cache(maxsize=LRU_SIZE)
+def word_frequency(word, lang, wordlist='multi', default=0.):
+    """
+    Get the frequency of `word` in the language with code `lang`, from the
+    specified `wordlist`.
+
+    If the word doesn't appear in the wordlist, return the default value.
+
+    Results are memoized in an LRU cache of up to `LRU_SIZE` entries, so
+    all of the arguments must be hashable.
+    """
+    cursor = CONN.cursor()
+    try:
+        cursor.execute(
+            "SELECT freq from words where word=? and lang=? and wordlist=?",
+            (word, lang, wordlist)
+        )
+        row = cursor.fetchone()
+    finally:
+        # Don't leave an open cursor behind if the query fails.
+        cursor.close()
+    if row is None:
+        return default
+    return row[0]
+
+
+# I'm sorry.
+METANL_CONSTANT = 50291582140.06433
+
+
+def metanl_word_frequency(word, lang, default=0.):
+    """
+    Return a word's frequency in a form that matches the output of
+    metanl 0.6.
+
+    The frequency from the 'multi' wordlist is multiplied by
+    `METANL_CONSTANT`. A word that isn't in the wordlist gets `default`,
+    unscaled.
+    """
+    # Ask for None on a miss, so that a missing word can be told apart
+    # from one whose stored frequency is 0.
+    freq = word_frequency(word, lang, 'multi', None)
+    if freq is None:
+        return default
+    return freq * METANL_CONSTANT
diff --git a/wordfreq/schema.py b/wordfreq/schema.py
new file mode 100644
index 0000000..6195802
--- /dev/null
+++ b/wordfreq/schema.py
@@ -0,0 +1,13 @@
+SCHEMA = """
+CREATE TABLE IF NOT EXISTS words (
+    wordlist text,
+    lang text,
+    word text,
+    freq real
+)
+"""
+
+INDICES = [
+    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
+    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
+]