Initial version.

Noticeably missing: data files or any way to get them.
This commit is contained in:
Rob Speer 2013-10-28 19:26:44 -04:00
commit 709ca6be66
8 changed files with 288 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*.pyc
*.swp
build
*.egg-info/
dist
pip-log.txt
.coverage
*~

3
README.txt Normal file
View File

@ -0,0 +1,3 @@
Tools for working with word frequencies from various corpora.
Author: Rob Speer

43
setup.py Executable file
View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
"""Setup script for the wordfreq package."""
import os
from setuptools import setup

version_str = '0.1'

classifiers = [
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: MacOS',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: POSIX',
    'Operating System :: Unix',
    'Programming Language :: C',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development',
    'Topic :: Text Processing :: Linguistic',
]

# The first line of the README becomes the short description; everything
# after the blank line becomes the long description.  Use a context manager
# so the file handle is closed instead of leaked.
readme_path = os.path.join(os.path.dirname(__file__), 'README.txt')
with open(readme_path) as readme_file:
    README_contents = readme_file.read()
doclines = README_contents.split("\n")

setup(
    name="wordfreq",
    version=version_str,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
    classifiers=classifiers,
    long_description="\n".join(doclines[2:]),
    packages=['wordfreq'],
    package_data={'wordfreq': ['data/wordlists/*.txt']},
    install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
)

0
wordfreq/__init__.py Normal file
View File

163
wordfreq/build.py Normal file
View File

@ -0,0 +1,163 @@
from collections import defaultdict
import sqlite3
import codecs
import re
import os
import logging
from ftfy import ftfy
from wordfreq import config, schema
logger = logging.getLogger(__name__)
def read_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line contains a term, a comma, and its frequency.

    Scale the frequencies so they add up to 1.0, and return them as a
    dictionary.
    """
    raw_counts = _read_csv_basic(filename)
    return _scale_freqs(raw_counts)
def read_multilingual_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line is of the form:

        term|lang,freq

    Scale the frequencies so they add up to 1.0 *for each language*,
    and return a dictionary from language -> (word -> freq).
    """
    # Group the raw frequencies by their language tag first.
    per_language = defaultdict(dict)
    raw_freqs = _read_csv_basic(filename)
    for full_term, freq in raw_freqs.items():
        word, language = full_term.rsplit('|', 1)
        per_language[language][word] = freq
    # Then normalize each language's frequencies independently.
    return {language: _scale_freqs(word_freqs)
            for language, word_freqs in per_language.items()}
def _read_csv_basic(filename):
    """
    Read a UTF-8 file of comma-separated (term, count) lines, and return a
    dictionary mapping each term to its count as a float.

    Terms may themselves contain commas; only the last comma on each line
    separates the term from its count.
    """
    counts = {}
    # Use a context manager so the file handle is closed instead of leaked.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip(u'\n')
            if not line:
                # Skip blank lines instead of crashing on the rsplit below.
                continue
            word, count = line.rsplit(u',', 1)
            counts[word] = float(count)
    return counts
NUMBER_RE = re.compile(u'[0-9]+')


def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html

    Each data line has the form "rank freq token"; lines whose first field
    isn't numeric (headers, comments) are skipped.  Tokens are fixed up
    with ftfy, lowercased, and frequencies for tokens that collapse to the
    same form are summed before scaling to a total of 1.0.
    """
    counts = defaultdict(float)
    # Use a context manager so the file handle is closed instead of leaked.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if line:
                rank = line.split(u' ')[0]
                if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                    _, freq, token = line.split(u' ')
                    token = ftfy(token).lower()
                    counts[token] += float(freq)
    return _scale_freqs(counts)
def _scale_freqs(counts):
    """
    Take in unscaled word counts or frequencies, and scale them so that
    they add up to 1.0.

    Return an empty dictionary when `counts` is empty, instead of dividing
    by a zero total.
    """
    if not counts:
        return {}
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}
def save_wordlist_to_db(conn, listname, lang, freqs):
rows = [(listname, lang, word, freq)
for word, freq in freqs.items()]
conn.executemany(
"INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
"VALUES (?, ?, ?, ?)",
rows
)
conn.commit()
def create_db(conn, filename=config.DB_FILENAME):
    """
    Create a wordlist database, at the filename specified by `wordfreq.config`.

    This should be safe to run (and have no effect) if the database already
    exists.
    """
    base_dir = os.path.dirname(filename)
    # A bare filename has no directory component, so dirname returns '';
    # calling os.makedirs('') would raise. Only create the parent directory
    # when there is one and it doesn't exist yet.
    if base_dir and not os.path.exists(base_dir):
        os.makedirs(base_dir)
    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
    conn.commit()
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')


def load_all_data(source_dir=config.RAW_DATA_DIR):
    """
    Read every raw wordlist under `source_dir` and store the scaled
    frequencies in the SQLite database named by `wordfreq.config`.
    """
    conn = sqlite3.connect(config.DB_FILENAME)
    logger.info("Creating database")
    create_db(conn)

    # Leeds "Web as Corpus" lists: one file per language code.
    for code in LEEDS_LANGUAGES:
        logger.info("Loading Leeds internet corpus: %s", code)
        leeds_path = os.path.join(
            source_dir, 'leeds', 'internet-%s-forms.num' % code
        )
        save_wordlist_to_db(conn, 'leeds-internet', code,
                            read_leeds_corpus(leeds_path))

    logger.info("Loading Google Books")
    google_path = os.path.join(source_dir, 'google',
                               'google-books-english.csv')
    save_wordlist_to_db(conn, 'google-books', 'en', read_csv(google_path))

    logger.info("Loading combined multilingual corpus")
    multi_path = os.path.join(source_dir, 'luminoso', 'multilingual.csv')
    by_language = read_multilingual_csv(multi_path)
    for code in by_language:
        logger.info("\tLanguage: %s", code)
        save_wordlist_to_db(conn, 'multi', code, by_language[code])

    logger.info("Loading Twitter corpus")
    twitter_path = os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
    save_wordlist_to_db(conn, 'twitter', 'xx', read_csv(twitter_path))

    logger.info("Done loading.")


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_all_data()

8
wordfreq/config.py Normal file
View File

@ -0,0 +1,8 @@
import os

# Directory holding the wordlist database and raw data; can be overridden
# with the WORDFREQ_DATA environment variable.
DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))

# Where raw corpus files are expected to live before being loaded.
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')

VERSION = '0.1'

# The database filename embeds the version, so a new wordfreq version gets
# a fresh database instead of reusing a stale one.
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)

# Maximum number of entries cached by the word_frequency LRU cache.
LRU_SIZE = 100000

50
wordfreq/query.py Normal file
View File

@ -0,0 +1,50 @@
from wordfreq.config import DB_FILENAME, LRU_SIZE
from functools32 import lru_cache
import sqlite3
# Message shown when the wordlist database can't be opened; the configured
# database path is interpolated once, at import time.
SQLITE_ERROR_TEXT = """
Couldn't open the wordlist database.
You may need to run wordfreq's setup.py script.
I was expecting to find the database at:
%(path)s
This can be configured by setting the WORDFREQ_DATA environment variable.
""" % {'path': DB_FILENAME}

# A single module-level connection shared by all queries.
# NOTE(review): sqlite3.connect usually *creates* a missing database file
# rather than raising OperationalError -- confirm this except clause fires
# in the situation the message describes.
try:
    CONN = sqlite3.connect(DB_FILENAME)
except sqlite3.OperationalError:
    raise IOError(SQLITE_ERROR_TEXT)
@lru_cache(maxsize=LRU_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    If the word doesn't appear in the wordlist, return the default value.
    """
    cursor = CONN.cursor()
    cursor.execute(
        "SELECT freq from words where word=? and lang=? and wordlist=?",
        (word, lang, wordlist)
    )
    result = cursor.fetchone()
    return default if result is None else result[0]
# I'm sorry.
METANL_CONSTANT = 50291582140.06433


def metanl_word_frequency(word, lang, default=0.):
    """
    Return a word's frequency in a form that matches the output of
    metanl 0.6.
    """
    # Look up with a None default so "not found" is distinguishable from a
    # genuine frequency of zero.
    found = word_frequency(word, lang, 'multi', None)
    return default if found is None else found * METANL_CONSTANT

13
wordfreq/schema.py Normal file
View File

@ -0,0 +1,13 @@
# Schema for the single `words` table: one row per (wordlist, lang, word)
# holding that word's scaled frequency.
SCHEMA = """
CREATE TABLE IF NOT EXISTS words (
wordlist text,
lang text,
word text,
freq real
)
"""

# A unique index that enforces one frequency per (wordlist, lang, word) --
# and lets INSERT OR REPLACE work -- plus an index for scanning a wordlist's
# entries by descending frequency.
INDICES = [
    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
]