Mirror of https://github.com/rspeer/wordfreq.git
Initial version.
Noticeably missing: data files or any way to get them.
commit e8273e47a1
8  .gitignore  vendored  Normal file
@@ -0,0 +1,8 @@
*.pyc
*.swp
build
*.egg-info/
dist
pip-log.txt
.coverage
*~
3  README.txt  Normal file
@@ -0,0 +1,3 @@
Tools for working with word frequencies from various corpora.

Author: Robyn Speer
43  setup.py  Executable file
@@ -0,0 +1,43 @@
#!/usr/bin/env python

version_str = '0.1'

from setuptools import setup

classifiers=[
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: MacOS',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: POSIX',
    'Operating System :: Unix',
    'Programming Language :: C',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development',
    'Topic :: Text Processing :: Linguistic',
]

import os
README_contents = open(os.path.join(os.path.dirname(__file__), 'README.txt')).read()
doclines = README_contents.split("\n")

setup(
    name="wordfreq",
    version=version_str,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
    classifiers=classifiers,
    long_description="\n".join(doclines[2:]),
    packages=['wordfreq'],
    package_data={'wordfreq': ['data/wordlists/*.txt']},
    install_requires=['ftfy >= 3', 'functools32 == 3.2.3-1'],
)
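Note: install_requires pins functools32 unconditionally, but that package is a
Python 2 backport of functools.lru_cache with no Python 3 release, so the
Python 3 classifiers above can't actually be satisfied by this setup.py as
written. A minimal sketch of a guard, assuming a setuptools recent enough to
understand PEP 508 environment markers (which the setuptools of this era may
not be):

    install_requires=[
        'ftfy >= 3',
        # hypothetical: only pull in the backport where it's needed
        'functools32 == 3.2.3-1; python_version < "3"',
    ],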
0  wordfreq/__init__.py  Normal file
163  wordfreq/build.py  Normal file
@@ -0,0 +1,163 @@
from collections import defaultdict
import sqlite3
import codecs
import re
import os
import logging

from ftfy import ftfy
from wordfreq import config, schema

logger = logging.getLogger(__name__)


def read_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line contains a term, a comma, and its frequency.

    Scale the frequencies so they add up to 1.0, and return them as a
    dictionary.
    """
    return _scale_freqs(_read_csv_basic(filename))


def read_multilingual_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line is of the form:

        term|lang,freq

    Scale the frequencies so they add up to 1.0 *for each language*,
    and return a dictionary from language -> (word -> freq).
    """
    unscaled = defaultdict(dict)
    raw_freqs = _read_csv_basic(filename)
    for termlang in raw_freqs:
        term, lang = termlang.rsplit('|', 1)
        unscaled[lang][term] = raw_freqs[termlang]

    scaled = {}
    for key in unscaled:
        scaled[key] = _scale_freqs(unscaled[key])
    return scaled


def _read_csv_basic(filename):
    infile = codecs.open(filename, encoding='utf-8')

    counts = {}
    for line in infile:
        line = line.rstrip(u'\n')
        # rsplit on the last comma, so terms containing commas survive
        word, count = line.rsplit(u',', 1)
        count = float(count)
        counts[word] = count
    return counts


NUMBER_RE = re.compile(u'[0-9]+')
def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')

    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            # Data lines have the form "<rank> <freq> <token>"; skip anything else.
            rank = line.split(u' ')[0]
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                token = ftfy(token).lower()
                freq = float(freq)
                counts[token] += freq

    return _scale_freqs(counts)


def _scale_freqs(counts):
    """
    Take in unscaled word counts or frequencies, and scale them so that
    they add up to 1.0.
    """
    freqs = {}
    total = sum(counts.values())
    for word in counts:
        freqs[word] = counts[word] / total

    return freqs


def save_wordlist_to_db(conn, listname, lang, freqs):
    # The UNIQUE index on (wordlist, lang, word) makes this an upsert.
    rows = [(listname, lang, word, freq)
            for word, freq in freqs.items()]
    conn.executemany(
        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
        "VALUES (?, ?, ?, ?)",
        rows
    )
    conn.commit()


def create_db(conn, filename=config.DB_FILENAME):
    """
    Create a wordlist database, at the filename specified by `wordfreq.config`.

    This should be safe to run (and have no effect) if the database already
    exists.
    """
    base_dir = os.path.dirname(filename)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    conn.execute(schema.SCHEMA)
    for index_definition in schema.INDICES:
        conn.execute(index_definition)
    conn.commit()


LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')
def load_all_data(source_dir=config.RAW_DATA_DIR):
    conn = sqlite3.connect(config.DB_FILENAME)
    logger.info("Creating database")
    create_db(conn)

    for lang in LEEDS_LANGUAGES:
        logger.info("Loading Leeds internet corpus: %s" % lang)
        filename = os.path.join(
            source_dir, 'leeds', 'internet-%s-forms.num' % lang
        )
        wordlist = read_leeds_corpus(filename)
        save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)

    logger.info("Loading Google Books")
    google_wordlist = read_csv(
        os.path.join(source_dir, 'google', 'google-books-english.csv')
    )
    save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)

    logger.info("Loading combined multilingual corpus")
    multi_wordlist = read_multilingual_csv(
        os.path.join(source_dir, 'luminoso', 'multilingual.csv')
    )
    for lang in multi_wordlist:
        logger.info("\tLanguage: %s" % lang)
        save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])

    logger.info("Loading Twitter corpus")
    twitter_wordlist = read_csv(
        os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
    )
    save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)

    logger.info("Done loading.")


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_all_data()
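Note: load_all_data calls sqlite3.connect before create_db gets a chance to
makedirs, and sqlite3.connect can't create intermediate directories, so on a
fresh machine the cache directory has to exist before the build starts. Based
on the paths in load_all_data, the raw data it expects (which, per the commit
message, this commit provides no way to obtain) would be laid out as:

    ~/.cache/wordfreq/raw_data/          # or $WORDFREQ_DATA/raw_data
        leeds/internet-<lang>-forms.num  # one per code in LEEDS_LANGUAGES
        google/google-books-english.csv
        luminoso/multilingual.csv
        luminoso/twitter-52M.csv

With that in place, `python -m wordfreq.build` populates the words table.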
8  wordfreq/config.py  Normal file
@@ -0,0 +1,8 @@
import os

DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
VERSION = '0.1'
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
LRU_SIZE = 100000
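Note: every other path in the package derives from DB_DIR, so relocating all of
wordfreq's data is a single environment variable. A minimal sketch, assuming the
variable is set before wordfreq.config is first imported (the location is
arbitrary):

    import os
    os.environ['WORDFREQ_DATA'] = '/srv/wordfreq'  # hypothetical location

    from wordfreq import config
    print(config.DB_FILENAME)  # -> /srv/wordfreq/words-0.1.sqlite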
50  wordfreq/query.py  Normal file
@@ -0,0 +1,50 @@
from wordfreq.config import DB_FILENAME, LRU_SIZE
try:
    # Python 2: lru_cache comes from the functools32 backport.
    from functools32 import lru_cache
except ImportError:
    # Python 3: it's in the standard library.
    from functools import lru_cache
import sqlite3

SQLITE_ERROR_TEXT = """
Couldn't open the wordlist database.
You may need to run wordfreq's setup.py script.

I was expecting to find the database at:

    %(path)s

This can be configured by setting the WORDFREQ_DATA environment variable.
""" % {'path': DB_FILENAME}

try:
    CONN = sqlite3.connect(DB_FILENAME)
except sqlite3.OperationalError:
    raise IOError(SQLITE_ERROR_TEXT)


@lru_cache(maxsize=LRU_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    If the word doesn't appear in the wordlist, return the default value.
    """
    c = CONN.cursor()
    c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
              (word, lang, wordlist))
    row = c.fetchone()
    if row is None:
        return default
    else:
        return row[0]


# I'm sorry.
METANL_CONSTANT = 50291582140.06433
def metanl_word_frequency(word, lang, default=0.):
    """
    Return a word's frequency in a form that matches the output of
    metanl 0.6.
    """
    freq = word_frequency(word, lang, 'multi', None)
    if freq is None:
        return default
    else:
        return freq * METANL_CONSTANT
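Note: the module opens its sqlite connection at import time, so importing
wordfreq.query raises the IOError above until the database has been built. A
minimal usage sketch once it exists (the word and result are illustrative, not
real data from this commit):

    from wordfreq.query import word_frequency

    # Frequencies are scaled to sum to 1.0 per wordlist and language,
    # so this is a proportion of the corpus, not a raw count.
    freq = word_frequency('the', 'en', wordlist='multi', default=0.0)
    if freq == 0.0:
        print("'the' is missing from the 'multi' wordlist for English")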
13  wordfreq/schema.py  Normal file
@@ -0,0 +1,13 @@
SCHEMA = """
CREATE TABLE IF NOT EXISTS words (
    wordlist text,
    lang text,
    word text,
    freq real
)
"""

INDICES = [
    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
]
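Note: the UNIQUE index on (wordlist, lang, word) is what makes build.py's
INSERT OR REPLACE behave as an upsert, while words_by_freq covers a "top N
words in a language" scan, with freq stored in descending order. A sketch of a
query that index serves (not part of this commit):

    import sqlite3
    from wordfreq.config import DB_FILENAME

    conn = sqlite3.connect(DB_FILENAME)
    c = conn.cursor()
    c.execute("SELECT word, freq FROM words "
              "WHERE wordlist=? AND lang=? "
              "ORDER BY freq DESC LIMIT 10",
              ('multi', 'en'))
    top_ten = c.fetchall()  # [(word, freq), ...] in descending frequency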