add new data files from wordfreq_builder

Former-commit-id: 35aec061de
Robyn Speer 2015-05-11 18:45:47 -04:00
parent f92598b13d
commit aa0e844b81
45 changed files with 204 additions and 639859 deletions

MANIFEST.in Normal file
View File

@@ -0,0 +1,2 @@
recursive-include wordfreq/data *.gz
include README.md

View File

@@ -33,5 +33,6 @@ setup(
classifiers = classifiers,
long_description = "\n".join(doclines[2:]),
packages=['wordfreq'],
include_package_data=True,
install_requires=['ftfy >= 4', 'msgpack-python'],
)

View File

@@ -1,16 +1,208 @@
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
import msgpack
import re
import gzip
import pathlib
import random
import logging
logger = logging.getLogger(__name__)
NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff０-９Ａ-Ｚａ-ｚ\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000
def tokenize(text):
"""
A simple tokenizer that can be applied to most languages. Strings that
are looked up in wordfreq will be run through this tokenizer first,
so that they can be expected to match the data.
"""
return [token.lower() for token in TOKEN_RE.findall(text)]
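# A sketch of the expected behavior: tokenize("Hello, world!") gives
# ['hello', 'world'], and the apostrophe rule keeps contractions together,
# so tokenize("can't stop") gives ["can't", 'stop'].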
def read_dBpack(filename):
"""
Read a file from an idiosyncratic format that we use for storing
approximate word frequencies, called "dBpack".
The dBpack format is as follows:
- The file on disk is a gzipped file in msgpack format, which decodes to a
list of lists of words.
- Each inner list of words corresponds to a particular word frequency,
rounded to the nearest decibel. 0 dB represents a word that occurs with
probability 1, so it is the only word in the data (this of course doesn't
happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB
represents a word that occurs once per 1000 tokens, and so on.
- The index of each list within the overall list is the negative of its
frequency in decibels.
- Each inner list is sorted in alphabetical order.
As an example, consider a corpus consisting only of the words "red fish
blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red"
and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word
frequencies would decode to this list:
[[], [], [], ['fish'], [], [], ['blue', 'red']]
"""
with gzip.open(filename, 'rb') as infile:
got = msgpack.load(infile, encoding='utf-8')
return got
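# A minimal sketch of writing a file in this format, for illustration only;
# write_dBpack is hypothetical and not part of wordfreq:
#
#     def write_dBpack(buckets, filename):
#         with gzip.open(filename, 'wb') as outfile:
#             msgpack.dump(buckets, outfile)
#
#     write_dBpack([[], [], [], ['fish'], [], [], ['blue', 'red']],
#                  'red-fish-blue-fish.msgpack.gz')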
def available_languages(wordlist='combined'):
"""
List the languages (as language-code strings) that the wordlist of a given
name is available in.
"""
available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = path
return available
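# For example, if the installed data files include combined_en.msgpack.gz and
# combined_pt.msgpack.gz (an assumption about what is present),
# available_languages() would return roughly
# {'en': DATA_PATH / 'combined_en.msgpack.gz',
#  'pt': DATA_PATH / 'combined_pt.msgpack.gz'}.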
@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='combined', match_cutoff=50):
"""
Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_dBpack` for what this represents.)
Because we use the `langcodes` module, we can handle slight
variations in language codes. For example, looking for 'pt-BR',
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
Looking up the alternate code 'por' will also get the same list.
"""
available = available_languages(wordlist)
best, score = langcodes.best_match(lang, list(available),
min_score=match_cutoff)
if score == 0:
raise LookupError("No wordlist available for language %r" % lang)
# Convert the LanguageData object to a normalized language code
got = str(best)
if got != lang:
logger.warning(
"You asked for word frequencies in language %r. Using the "
"nearest match, which is %r (%s)."
% (lang, got, best.language_name('en'))
)
filepath = available[str(best)]
return read_dBpack(str(filepath))
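# A usage sketch, assuming the 'pt' combined wordlist is among the installed
# data files: get_frequency_list('pt_BR') and get_frequency_list('por') should
# both resolve to the same 'pt' list, logging a warning that the nearest match
# was used.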
def dB_to_freq(dB):
if dB > 0:
raise ValueError(
"A frequency cannot be a positive number of decibels."
)
return 10 ** (dB / 10)
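# Worked values: dB_to_freq(0) == 1.0, dB_to_freq(-20) == 0.01 (once per 100
# tokens), and dB_to_freq(-30) == 0.001 (once per 1000 tokens), matching the
# scale described in read_dBpack's docstring.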
@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='combined', match_cutoff=50):
"""
Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities.
"""
freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff)
for index, bucket in enumerate(pack):
for word in bucket:
freqs[word] = dB_to_freq(-index)
return freqs
def iter_wordlist(lang, wordlist='combined'):
"""
Yield the words in a wordlist in approximate descending order of
frequency.
Because wordfreq rounds off its frequencies, the words will form 'bands'
with the same rounded frequency, appearing in alphabetical order within
each band.
"""
for sublist in get_frequency_list(lang, wordlist):
for word in sublist:
yield word
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`. The default (and currently only) wordlist is
'combined', built from whichever of these four sources have sufficient
data for the language:
- Full text of Wikipedia
- A sample of 72 million tweets collected from Twitter in 2014,
divided roughly into languages using automatic language detection
- Frequencies extracted from OpenSubtitles
- The Leeds Internet Corpus
Words that we believe occur at least once per million tokens, based on
the average of these lists, will appear in the word frequency list.
If you look up a word that's not in the list, you'll get the `default`
value, which itself defaults to 0.
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
individual tokens.
"""
freqs = get_frequency_dict(lang, wordlist)
combined_value = None
for token in tokenize(word):
if token not in freqs:
# If any word is missing, just return the default value
return default
value = freqs[token]
if combined_value is None:
combined_value = value
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = (combined_value * value) / (combined_value + value)
return combined_value
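# A worked example with made-up frequencies: if the tokens 'red' and 'fish'
# each had frequency 1e-3, word_frequency('red fish', 'en') would combine them
# as (1e-3 * 1e-3) / (1e-3 + 1e-3) == 5e-4, half the frequency of either token
# and never more than the rarer one.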
@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
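# Return the `n` most frequent words in the given language's wordlist, in
# approximately descending frequency order. When `ascii_only` is set, a word
# is kept only if its greatest character is at most '~' (the last printable
# ASCII character), i.e. the whole word is plain ASCII.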
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= '~':
results.append(word)
if len(results) >= n:
break
return results
def random_words(nwords=4, lang='en', wordlist='combined', bits_per_word=12,
ascii_only=False):
n_choices = 2 ** bits_per_word
choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
if len(choices) < n_choices:
raise ValueError(
"There aren't enough words in the wordlist to provide %d bits of "
"entropy per word." % bits_per_word
)
selected = [random.choice(choices) for i in range(nwords)]
return ' '.join(selected)
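# A sketch of the arithmetic behind the defaults: bits_per_word=12 draws each
# word uniformly from the top 2 ** 12 == 4096 words, so nwords=4 gives a
# passphrase with about 4 * 12 == 48 bits of entropy, in the spirit of
# https://xkcd.com/936/.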
def random_ascii_words(nwords=4, lang='en', wordlist='combined',
bits_per_word=12):
return random_words(nwords, lang, wordlist, bits_per_word, ascii_only=True)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,128 +0,0 @@
from wordfreq.config import DB_FILENAME, CACHE_SIZE
from functools import lru_cache
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='multi', offset=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`.
The offset gets added to all values, to monotonically account for the
fact that we have not observed all possible words.
"""
c = CONN.cursor()
c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
(standardize_word(word), lang, wordlist))
row = c.fetchone()
if row is None:
return offset
else:
return row[0] + offset
def wordlist_size(wordlist, lang=None):
"""
Get the number of words in a wordlist.
"""
c = CONN.cursor()
if lang is None:
c.execute(
"SELECT count(*) from words where wordlist=?",
(wordlist,)
)
else:
c.execute(
"SELECT count(*) from words where wordlist=? and lang=?",
(wordlist, lang)
)
return c.fetchone()[0]
def average_frequency(wordlist, lang):
"""
A kind of slow function to get the average frequency for words in a
wordlist.
If, for example, you're smoothing over word frequencies by adding the
same baseline number to all of them, this can tell you what a good
baseline is. (For multi/en, it's 6.7e-07.)
"""
c = CONN.cursor()
c.execute("SELECT avg(freq) from words where wordlist=? and lang=?",
(wordlist, lang))
return c.fetchone()[0]
def iter_wordlist(wordlist='multi', lang=None):
"""
Returns a generator, yielding (word, lang, frequency) triples from
a wordlist in descending order of frequency.
If a `lang` is specified, the results will only contain words in that
language.
"""
c = CONN.cursor()
if lang is None:
results = c.execute(
"SELECT word, lang, freq from words where wordlist=? "
"ORDER BY freq desc",
(wordlist,)
)
else:
results = c.execute(
"SELECT word, lang, freq from words where "
"wordlist=? and lang=? ORDER BY freq DESC",
(wordlist, lang)
)
return results
def wordlist_info(connection=None):
"""
Get info about all the wordlists in a database, returning their
list name, language, and number of words as 'wordlist', 'lang',
and 'count' respectively.
The database connection can be given as an argument, in order to get
information about a database other than the default configured one.
"""
if connection is None:
connection = CONN
c = connection.cursor()
results = c.execute(
"SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
)
for wordlist, lang, count in results:
yield {'wordlist': wordlist, 'lang': lang, 'count': count}
def random_words(nwords=4, bits_per_word=12, wordlist='google-books',
lang='en'):
"""
There are a few reasons you might want to see a sample of words in a
wordlist:
- Generating test cases
- Getting a feel for what a wordlist contains
- Generating passwords as in https://xkcd.com/936/
Parameters:
- `nwords` is the number of words to select.
- `bits_per_word` indicates how many bits of randomness per word you want,
up to log2(wordlist_size). As you increase it, the words get more obscure.
- `wordlist` and `lang` specify the wordlist to use.
"""
import random
limit = 2 ** bits_per_word
c = CONN.cursor()
results = c.execute(
"SELECT word from words where wordlist = ? and lang = ? "
"ORDER BY freq DESC LIMIT ?",
(wordlist, lang, limit)
)
words = [row[0] for row in results]
selected = random.sample(words, nwords)
return u' '.join(selected)

View File

@@ -1,10 +0,0 @@
This data was compiled from the Google Books Ngram Viewer data, particularly
the 2012 English dataset.
The data is available from https://books.google.com/ngrams. The terms of use of
this data are:
"Ngram Viewer graphs and data may be freely used for any purpose, although
acknowledgement of Google Books Ngram Viewer as the source, and inclusion of a
link to http://books.google.com/ngrams, would be appreciated."

View File

@@ -1 +0,0 @@
48b238cc5b3d359d0e8ac48f6321aca27c1ec098

View File

@@ -1,5 +0,0 @@
These wordlists come from the University of Leeds Centre for Translation
Studies, and are provided for free under a Creative Commons Attribution
license.
For more information, see: http://corpus.leeds.ac.uk/list.html

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +0,0 @@
This directory contains two wordlists we've put together at Luminoso for our
own purposes. You might find them useful as well.
* `twitter-52M` collects the unigram word frequencies from 52 million tweets.
The words are not distinguished by language.
* `multi` combines various sources of data in different languages, including:
* Google Books, for English
* A smaller corpus of tweets that supposedly come from English speakers
(there's still a lot of non-English text in there)
* the Leeds corpora for various languages (see `../leeds/README.txt`)
We would like to release the tools that built `twitter-52M` as soon as they are
less sloppy. `multi` is a dataset that is mainly relevant because it's the data
we happen to already be using, but you might find it useful as well.

View File

@@ -1 +0,0 @@
3f7a03ee49e8f33c2526beb33d61e27968a96b39

View File

@@ -1 +0,0 @@
9b29de132c82bd7287c08c2937e3c4821525e356

View File

@@ -1 +0,0 @@
956c3ff57edf5c45f3e850efd87a30d25c1b4bee

View File

@@ -1 +0,0 @@
4c5a66db8a4190a173814a4d7b31b925c5b131d1

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
3710e65f27753facc699fe56269c9631d5ba6aba

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
1e9d162c0c1333ce4a9afd79cd8686805f1e19c3

View File

@@ -1 +0,0 @@
b9d52d81bbe078a7de17519ed3494eb4771f0f69

View File

@@ -1 +0,0 @@
f69e13f6be1183f69166fe287ada38354ce4de99