mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Initial version.
Noticeably missing: data files or any way to get them.
This commit is contained in:
commit
e8273e47a1
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
*.pyc
|
||||||
|
*.swp
|
||||||
|
build
|
||||||
|
*.egg-info/
|
||||||
|
dist
|
||||||
|
pip-log.txt
|
||||||
|
.coverage
|
||||||
|
*~
|
3
README.txt
Normal file
3
README.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
Tools for working with word frequencies from various corpora.
|
||||||
|
|
||||||
|
Author: Robyn Speer
|
43
setup.py
Executable file
43
setup.py
Executable file
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python
"""Packaging script for the wordfreq library."""
import os
import sys

from setuptools import setup

version_str = '0.1'

classifiers = [
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: MacOS',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: POSIX',
    'Operating System :: Unix',
    'Programming Language :: C',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Topic :: Scientific/Engineering',
    'Topic :: Software Development',
    'Topic :: Text Processing :: Linguistic',
]

# The README's first line becomes the short description; everything after the
# blank line becomes the long description.  Use a with-block so the file
# handle is closed promptly instead of leaking until garbage collection.
with open(os.path.join(os.path.dirname(__file__), 'README.txt')) as readme_file:
    README_contents = readme_file.read()
doclines = README_contents.split("\n")

# functools32 is a Python 2 backport of functools.lru_cache.  Installing it
# on Python 3 fails, so only require it on Python 2 -- the classifiers above
# advertise Python 3 support, and the unconditional requirement broke that.
install_requires = ['ftfy >= 3']
if sys.version_info[0] == 2:
    install_requires.append('functools32 == 3.2.3-1')

setup(
    name="wordfreq",
    version=version_str,
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='dev@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
    classifiers=classifiers,
    long_description="\n".join(doclines[2:]),
    packages=['wordfreq'],
    package_data={'wordfreq': ['data/wordlists/*.txt']},
    install_requires=install_requires,
)
|
0
wordfreq/__init__.py
Normal file
0
wordfreq/__init__.py
Normal file
163
wordfreq/build.py
Normal file
163
wordfreq/build.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
from collections import defaultdict
import sqlite3
import codecs
import re
import os
import logging

from ftfy import ftfy
from wordfreq import config, schema

# Module-level logger, following the standard getLogger(__name__) convention;
# handlers/levels are configured by the caller (see the __main__ block).
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def read_csv(filename):
    """
    Load word frequencies from a comma-separated file in which each line
    holds a term, a comma, and the term's frequency.

    The raw values are normalized so they sum to 1.0, and the result is
    returned as a dictionary mapping each term to its scaled frequency.
    """
    raw_counts = _read_csv_basic(filename)
    return _scale_freqs(raw_counts)
|
||||||
|
|
||||||
|
|
||||||
|
def read_multilingual_csv(filename):
    """
    Load word frequencies from a comma-separated file whose lines have the
    form:

        term|lang,freq

    The frequencies are normalized to sum to 1.0 *within each language*.
    Returns a dictionary mapping each language code to its own
    (word -> frequency) dictionary.
    """
    per_language = defaultdict(dict)
    for termlang, freq in _read_csv_basic(filename).items():
        # The language tag follows the final '|', so split from the right in
        # case the term itself contains a '|'.
        term, lang = termlang.rsplit('|', 1)
        per_language[lang][term] = freq

    return {lang: _scale_freqs(words)
            for lang, words in per_language.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def _read_csv_basic(filename):
|
||||||
|
infile = codecs.open(filename, encoding='utf-8')
|
||||||
|
|
||||||
|
counts = {}
|
||||||
|
for line in infile:
|
||||||
|
line = line.rstrip(u'\n')
|
||||||
|
word, count = line.rsplit(u',', 1)
|
||||||
|
count = float(count)
|
||||||
|
counts[word] = count
|
||||||
|
return counts
|
||||||
|
|
||||||
|
|
||||||
|
NUMBER_RE = re.compile(u'[0-9]+')


def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html

    Returns a dictionary of frequencies scaled to sum to 1.0.
    """
    counts = defaultdict(float)
    # Use a with-block so the file handle is released even if a line fails
    # to parse (the original never closed the file).
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue
            rank = line.split(u' ')[0]
            # Data lines look like "<rank> <freq> <token>"; lines that don't
            # start with a number (headers, comments) are skipped.
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                # ftfy repairs mojibake before the token is case-folded, so
                # variants of the same word accumulate into one entry.
                token = ftfy(token).lower()
                counts[token] += float(freq)

    return _scale_freqs(counts)
|
||||||
|
|
||||||
|
|
||||||
|
def _scale_freqs(counts):
|
||||||
|
"""
|
||||||
|
Take in unscaled word counts or frequencies, and scale them so that
|
||||||
|
they add up to 1.0.
|
||||||
|
"""
|
||||||
|
freqs = {}
|
||||||
|
total = sum(counts.values())
|
||||||
|
for word in counts:
|
||||||
|
freqs[word] = counts[word] / total
|
||||||
|
|
||||||
|
return freqs
|
||||||
|
|
||||||
|
|
||||||
|
def save_wordlist_to_db(conn, listname, lang, freqs):
    """
    Write every (word, frequency) pair in `freqs` to the `words` table of
    the given database connection, tagged with the wordlist name and
    language code, then commit.

    Existing rows for the same (wordlist, lang, word) are replaced.
    """
    insert_sql = (
        "INSERT OR REPLACE INTO words (wordlist, lang, word, freq) "
        "VALUES (?, ?, ?, ?)"
    )
    conn.executemany(
        insert_sql,
        ((listname, lang, word, freq) for word, freq in freqs.items()),
    )
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def create_db(conn, filename=config.DB_FILENAME):
    """
    Create the wordlist database schema on the given connection, making
    sure the directory for `filename` (by default, the path configured in
    `wordfreq.config`) exists first.

    Safe to run repeatedly: the schema and indices use IF NOT EXISTS.
    """
    db_directory = os.path.dirname(filename)
    if not os.path.exists(db_directory):
        os.makedirs(db_directory)

    conn.execute(schema.SCHEMA)
    for index_sql in schema.INDICES:
        conn.execute(index_sql)
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
LEEDS_LANGUAGES = ('ar', 'de', 'el', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh')


def load_all_data(source_dir=config.RAW_DATA_DIR):
    """
    Build the wordlist database from every known source corpus found under
    `source_dir`: the Leeds internet corpora, Google Books, the combined
    multilingual corpus, and the Twitter corpus.

    Creates the database (if needed) at the location configured in
    `wordfreq.config`, and closes the connection when done.
    """
    conn = sqlite3.connect(config.DB_FILENAME)
    try:
        logger.info("Creating database")
        create_db(conn)

        for lang in LEEDS_LANGUAGES:
            # Lazy %-style logger args: formatting is skipped entirely when
            # the message is filtered out by the log level.
            logger.info("Loading Leeds internet corpus: %s", lang)
            filename = os.path.join(
                source_dir, 'leeds', 'internet-%s-forms.num' % lang
            )
            wordlist = read_leeds_corpus(filename)
            save_wordlist_to_db(conn, 'leeds-internet', lang, wordlist)

        logger.info("Loading Google Books")
        google_wordlist = read_csv(
            os.path.join(source_dir, 'google', 'google-books-english.csv')
        )
        save_wordlist_to_db(conn, 'google-books', 'en', google_wordlist)

        logger.info("Loading combined multilingual corpus")
        multi_wordlist = read_multilingual_csv(
            os.path.join(source_dir, 'luminoso', 'multilingual.csv')
        )
        for lang in multi_wordlist:
            logger.info("\tLanguage: %s", lang)
            save_wordlist_to_db(conn, 'multi', lang, multi_wordlist[lang])

        logger.info("Loading Twitter corpus")
        twitter_wordlist = read_csv(
            os.path.join(source_dir, 'luminoso', 'twitter-52M.csv')
        )
        save_wordlist_to_db(conn, 'twitter', 'xx', twitter_wordlist)

        logger.info("Done loading.")
    finally:
        # Release the connection even if one of the corpora fails to load
        # (the original left it open).
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Running this module as a script builds the entire wordlist database.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_all_data()
|
8
wordfreq/config.py
Normal file
8
wordfreq/config.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import os

# Directory holding the wordlist database.  Overridable through the
# WORDFREQ_DATA environment variable; defaults to ~/.cache/wordfreq.
DB_DIR = (os.environ.get('WORDFREQ_DATA')
          or os.path.expanduser('~/.cache/wordfreq'))
# Where the raw source corpora are expected to live before building the
# database (see wordfreq.build.load_all_data).
RAW_DATA_DIR = os.path.join(DB_DIR, 'raw_data')
VERSION = '0.1'
# The version is embedded in the filename, so incompatible database formats
# from different wordfreq versions can coexist.
DB_FILENAME = os.path.join(DB_DIR, "words-%s.sqlite" % VERSION)
# Maximum number of entries kept by the word_frequency LRU cache
# (see wordfreq.query).
LRU_SIZE = 100000
|
50
wordfreq/query.py
Normal file
50
wordfreq/query.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
from wordfreq.config import DB_FILENAME, LRU_SIZE
# NOTE(review): functools32 is the Python 2 backport of functools.lru_cache;
# this import will fail on Python 3, where the cache lives in functools.
from functools32 import lru_cache
import sqlite3

# Shown when the database can't be opened -- most likely because the build
# step (wordfreq.build) has not been run yet.
SQLITE_ERROR_TEXT = """
Couldn't open the wordlist database.
You may need to run wordfreq's setup.py script.

I was expecting to find the database at:

    %(path)s

This can be configured by setting the WORDFREQ_DATA environment variable.
""" % {'path': DB_FILENAME}

# One module-level connection is opened at import time and shared by every
# query in this module.
try:
    CONN = sqlite3.connect(DB_FILENAME)
except sqlite3.OperationalError:
    raise IOError(SQLITE_ERROR_TEXT)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=LRU_SIZE)
def word_frequency(word, lang, wordlist='multi', default=0.):
    """
    Look up the frequency of `word` for the language code `lang` in the
    named `wordlist`.

    Returns `default` when the word is not present.  Results are memoized
    in an LRU cache of up to LRU_SIZE entries.
    """
    cursor = CONN.cursor()
    cursor.execute(
        "SELECT freq from words where word=? and lang=? and wordlist=?",
        (word, lang, wordlist)
    )
    row = cursor.fetchone()
    return default if row is None else row[0]
|
||||||
|
|
||||||
|
# I'm sorry.  (Magic scaling factor that reproduces metanl 0.6's output.)
METANL_CONSTANT = 50291582140.06433


def metanl_word_frequency(word, lang, default=0.):
    """
    Return a word's frequency scaled so it matches the output of
    metanl 0.6.

    Falls back to `default` when the word is absent from the 'multi'
    wordlist.
    """
    freq = word_frequency(word, lang, 'multi', None)
    if freq is None:
        return default
    return freq * METANL_CONSTANT
|
13
wordfreq/schema.py
Normal file
13
wordfreq/schema.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# One row per (wordlist, lang, word); `freq` is the word's scaled frequency.
SCHEMA = """
CREATE TABLE IF NOT EXISTS words (
    wordlist text,
    lang text,
    word text,
    freq real
)
"""

# The unique index makes INSERT OR REPLACE upsert on (wordlist, lang, word);
# the second index supports listing a wordlist's words by descending
# frequency.
INDICES = [
    "CREATE UNIQUE INDEX IF NOT EXISTS words_uniq ON words (wordlist, lang, word)",
    "CREATE INDEX IF NOT EXISTS words_by_freq ON words (wordlist, lang, freq DESC)",
]
|
Loading…
Reference in New Issue
Block a user