mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
parent f92598b13d
commit aa0e844b81
MANIFEST.in (new file, +2)
@@ -0,0 +1,2 @@
recursive-include wordfreq/data *.gz
include README.md
setup.py (+1)
@@ -33,5 +33,6 @@ setup(
    classifiers = classifiers,
    long_description = "\n".join(doclines[2:]),
    packages=['wordfreq'],
    include_package_data=True,
    install_requires=['ftfy >= 4', 'msgpack-python'],
)
wordfreq/__init__.py
@@ -1,16 +1,208 @@
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
import msgpack
import re
import gzip
import pathlib
import random
import logging

logger = logging.getLogger(__name__)


NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

CACHE_SIZE = 100000

def tokenize(text):
    """
    A simple tokenizer that can be applied to most languages. Strings that
    are looked up in wordfreq will be run through this tokenizer first,
    so that they can be expected to match the data.
    """
    return [token.lower() for token in TOKEN_RE.findall(text)]

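# Illustrative call (a sketch of the expected behavior, not part of the
# module): internal apostrophes stay attached to their word, and everything
# is lowercased.
#
#     >>> tokenize("Don't panic, okay?")
#     ["don't", 'panic', 'okay']
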
def read_dBpack(filename):
    """
    Read a file from an idiosyncratic format that we use for storing
    approximate word frequencies, called "dBpack".

    The dBpack format is as follows:

    - The file on disk is a gzipped file in msgpack format, which decodes to
      a list of lists of words.

    - Each inner list of words corresponds to a particular word frequency,
      rounded to the nearest decibel. 0 dB represents a word that occurs with
      probability 1, so it is the only word in the data (this of course
      doesn't happen). -20 dB represents a word that occurs once per 100
      tokens, -30 dB represents a word that occurs once per 1000 tokens, and
      so on.

    - The index of each list within the overall list is the negative of its
      frequency in decibels.

    - Each inner list is sorted in alphabetical order.

    As an example, consider a corpus consisting only of the words "red fish
    blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red"
    and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word
    frequencies would decode to this list:

        [[], [], [], ['fish'], [], [], ['blue', 'red']]
    """
    with gzip.open(filename, 'rb') as infile:
        got = msgpack.load(infile, encoding='utf-8')
    return got

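# A minimal sketch of the inverse operation, writing a dBpack file from a
# dict of word probabilities. `write_dBpack` is a hypothetical helper shown
# only to illustrate the format described above; gzip and msgpack are the
# modules already imported at the top of this file.
import math

def write_dBpack(freqs, filename):
    buckets = []
    for word, freq in freqs.items():
        dB = int(round(10 * math.log10(freq)))  # e.g. 0.5 -> -3 dB
        index = -dB                             # the list index is -dB
        while len(buckets) <= index:
            buckets.append([])
        buckets[index].append(word)
    for bucket in buckets:
        bucket.sort()                           # inner lists are alphabetical
    with gzip.open(filename, 'wb') as outfile:
        outfile.write(msgpack.dumps(buckets))

# write_dBpack({'fish': 0.5, 'red': 0.25, 'blue': 0.25}, 'example.msgpack.gz')
# would decode back, via read_dBpack, to
# [[], [], [], ['fish'], [], [], ['blue', 'red']].
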
def available_languages(wordlist='combined'):
    """
    List the languages (as language-code strings) that the wordlist of a
    given name is available in.
    """
    available = {}
    for path in DATA_PATH.glob('*.msgpack.gz'):
        list_name = path.name.split('.')[0]
        name, lang = list_name.split('_')
        if name == wordlist:
            available[lang] = path
    return available

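# File naming convention implied by the glob above: a data file such as
#
#     wordfreq/data/combined_en.msgpack.gz
#
# contributes {'en': <path to that file>} to the result for the 'combined'
# wordlist.
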
@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='combined', match_cutoff=50):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_dBpack` for what this represents.)

    Because we use the `langcodes` module, we can handle slight
    variations in language codes. For example, looking for 'pt-BR',
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
    available = available_languages(wordlist)
    best, score = langcodes.best_match(lang, list(available),
                                       min_score=match_cutoff)
    if score == 0:
        raise LookupError("No wordlist available for language %r" % lang)

    # Convert the LanguageData object to a normalized language code
    got = str(best)
    if got != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, got, best.language_name('en'))
        )

    filepath = available[got]
    return read_dBpack(str(filepath))

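# Illustrative lookups (a sketch, assuming a Portuguese wordlist ships in the
# data directory): each of these resolves to the same 'pt' list, as the
# docstring describes.
#
#     get_frequency_list('pt-BR')
#     get_frequency_list('pt_br')
#     get_frequency_list('por')
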
def dB_to_freq(dB):
    if dB > 0:
        raise ValueError(
            "A frequency cannot be a positive number of decibels."
        )
    return 10 ** (dB / 10)

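# Quick sanity checks of the decibel conversion (illustrative values only,
# matching the dBpack description above):
#
#     dB_to_freq(0)    -> 1.0        (probability 1)
#     dB_to_freq(-20)  -> 0.01       (once per 100 tokens)
#     dB_to_freq(-30)  -> 0.001      (once per 1000 tokens)
#     dB_to_freq(-3)   -> about 0.5  (the "fish" example above)
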
@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='combined', match_cutoff=50):
    """
    Get a word frequency list as a dictionary, mapping tokens to
    frequencies as floating-point probabilities.
    """
    freqs = {}
    pack = get_frequency_list(lang, wordlist, match_cutoff)
    for index, bucket in enumerate(pack):
        for word in bucket:
            freqs[word] = dB_to_freq(-index)
    return freqs

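# Continuing the "red fish blue fish" example from read_dBpack: the pack
# [[], [], [], ['fish'], [], [], ['blue', 'red']] becomes, roughly,
#
#     {'fish': 0.5, 'blue': 0.25, 'red': 0.25}
#
# since bucket index 3 maps to dB_to_freq(-3) and index 6 to dB_to_freq(-6).
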
def iter_wordlist(lang, wordlist='combined'):
    """
    Yield the words in a wordlist in approximate descending order of
    frequency.

    Because wordfreq rounds off its frequencies, the words will form 'bands'
    with the same rounded frequency, appearing in alphabetical order within
    each band.
    """
    for sublist in get_frequency_list(lang, wordlist):
        for word in sublist:
            yield word

@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`. The default (and currently only) wordlist is
    'combined', built from whichever of these four sources have sufficient
    data for the language:

    - Full text of Wikipedia
    - A sample of 72 million tweets collected from Twitter in 2014,
      divided roughly into languages using automatic language detection
    - Frequencies extracted from OpenSubtitles
    - The Leeds Internet Corpus

    Words that we believe occur at least once per million tokens, based on
    the average of these lists, will appear in the word frequency list.
    If you look up a word that's not in the list, you'll get the `default`
    value, which itself defaults to 0.

    If a word decomposes into multiple tokens, we'll return a smoothed
    estimate of the word frequency that is no greater than the frequency of
    any of its individual tokens.
    """
    freqs = get_frequency_dict(lang, wordlist)
    combined_value = None
    for token in tokenize(word):
        if token not in freqs:
            # If any word is missing, just return the default value
            return default
        value = freqs[token]
        if combined_value is None:
            combined_value = value
        else:
            # Combine word values using the half-harmonic-mean formula,
            # (a * b) / (a + b). This operation is associative.
            combined_value = (combined_value * value) / (combined_value + value)
    return combined_value

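# A worked example of the half-harmonic-mean combination (illustrative
# frequencies, not real data): if 'new' had frequency 0.02 and 'york' had
# frequency 0.01, looking up "New York" would give
#
#     (0.02 * 0.01) / (0.02 + 0.01) = 0.0002 / 0.03, about 0.0067
#
# which is smaller than either individual frequency, as the docstring
# promises. Because the operation is associative, longer phrases can be
# folded in one token at a time, in any order.
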
@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
    results = []
    for word in iter_wordlist(lang, wordlist):
        if (not ascii_only) or max(word) <= '~':
            results.append(word)
            if len(results) >= n:
                break
    return results

def random_words(nwords=4, lang='en', wordlist='combined', bits_per_word=12,
                 ascii_only=False):
    n_choices = 2 ** bits_per_word
    choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
    if len(choices) < n_choices:
        raise ValueError(
            "There aren't enough words in the wordlist to provide %d bits of "
            "entropy per word." % bits_per_word
        )
    selected = [random.choice(choices) for i in range(nwords)]
    return ' '.join(selected)

def random_ascii_words(nwords=4, lang='en', wordlist='combined',
                       bits_per_word=12):
    return random_words(nwords, lang, wordlist, bits_per_word, ascii_only=True)

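# With the defaults (nwords=4, bits_per_word=12), each word is drawn
# uniformly from the 4096 most common words, so a phrase carries about
# 4 * 12 = 48 bits of entropy, in the style of https://xkcd.com/936/.
# Illustrative calls:
#
#     random_words(lang='en', bits_per_word=12)
#     random_ascii_words(nwords=5)
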
New binary data files (contents not shown):

BIN wordfreq/data/combined_ar.msgpack.gz
BIN wordfreq/data/combined_de.msgpack.gz
BIN wordfreq/data/combined_el.msgpack.gz
BIN wordfreq/data/combined_en.msgpack.gz
BIN wordfreq/data/combined_es.msgpack.gz
BIN wordfreq/data/combined_fr.msgpack.gz
BIN wordfreq/data/combined_id.msgpack.gz
BIN wordfreq/data/combined_it.msgpack.gz
BIN wordfreq/data/combined_ja.msgpack.gz
BIN wordfreq/data/combined_ko.msgpack.gz
BIN wordfreq/data/combined_ms.msgpack.gz
BIN wordfreq/data/combined_nl.msgpack.gz
BIN wordfreq/data/combined_pt.msgpack.gz
BIN wordfreq/data/combined_ru.msgpack.gz
BIN wordfreq/data/combined_zh.msgpack.gz
wordfreq/query.py (deleted)
@@ -1,128 +0,0 @@
from wordfreq.config import DB_FILENAME, CACHE_SIZE
from functools import lru_cache


@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    The offset gets added to all values, to monotonically account for the
    fact that we have not observed all possible words.
    """
    c = CONN.cursor()
    c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
              (standardize_word(word), lang, wordlist))
    row = c.fetchone()
    if row is None:
        return offset
    else:
        return row[0] + offset

def wordlist_size(wordlist, lang=None):
    """
    Get the number of words in a wordlist.
    """
    c = CONN.cursor()
    if lang is None:
        c.execute(
            "SELECT count(*) from words where wordlist=?",
            (wordlist,)
        )
    else:
        c.execute(
            "SELECT count(*) from words where wordlist=? and lang=?",
            (wordlist, lang)
        )
    return c.fetchone()[0]

def average_frequency(wordlist, lang):
    """
    A kind of slow function to get the average frequency for words in a
    wordlist.

    If, for example, you're smoothing over word frequencies by adding the
    same baseline number to all of them, this can tell you what a good
    baseline is. (For multi/en, it's 6.7e-07.)
    """
    c = CONN.cursor()
    c.execute("SELECT avg(freq) from words where wordlist=? and lang=?",
              (wordlist, lang))
    return c.fetchone()[0]

def iter_wordlist(wordlist='multi', lang=None):
    """
    Returns a generator, yielding (word, lang, frequency) triples from
    a wordlist in descending order of frequency.

    If a `lang` is specified, the results will only contain words in that
    language.
    """
    c = CONN.cursor()
    if lang is None:
        results = c.execute(
            "SELECT word, lang, freq from words where wordlist=? "
            "ORDER BY freq desc",
            (wordlist,)
        )
    else:
        results = c.execute(
            "SELECT word, lang, freq from words where "
            "wordlist=? and lang=? ORDER BY freq DESC",
            (wordlist, lang)
        )

    return results

def wordlist_info(connection=None):
    """
    Get info about all the wordlists in a database, returning their
    list name, language, and number of words as 'wordlist', 'lang',
    and 'count' respectively.

    The database connection can be given as an argument, in order to get
    information about a database other than the default configured one.
    """
    if connection is None:
        connection = CONN
    c = connection.cursor()
    results = c.execute(
        "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
    )
    for wordlist, lang, count in results:
        yield {'wordlist': wordlist, 'lang': lang, 'count': count}

def random_words(nwords=4, bits_per_word=12, wordlist='google-books',
                 lang='en'):
    """
    There are a few reasons you might want to see a sample of words in a
    wordlist:

    - Generating test cases
    - Getting a feel for what a wordlist contains
    - Generating passwords as in https://xkcd.com/936/

    Parameters:

    - `nwords` is the number of words to select.
    - `bits_per_word` indicates how many bits of randomness per word you
      want, up to log2(wordlist_size). As you increase it, the words get
      more obscure.
    - `wordlist` and `lang` specify the wordlist to use.
    """
    import random
    limit = 2 ** bits_per_word
    c = CONN.cursor()
    results = c.execute(
        "SELECT word from words where wordlist = ? and lang = ? "
        "ORDER BY freq DESC LIMIT ?",
        (wordlist, lang, limit)
    )
    words = [row[0] for row in results]
    selected = random.sample(words, nwords)
    return u' '.join(selected)
@@ -1,10 +0,0 @@
This data was compiled from the Google Books Ngram Viewer data, particularly
the 2012 English dataset.

The data is available from https://books.google.com/ngrams. The terms of use
of this data are:

"Ngram Viewer graphs and data may be freely used for any purpose, although
acknowledgement of Google Books Ngram Viewer as the source, and inclusion of
a link to http://books.google.com/ngrams, would be appreciated."
@@ -1 +0,0 @@
48b238cc5b3d359d0e8ac48f6321aca27c1ec098
@@ -1,5 +0,0 @@
These wordlists come from the University of Leeds Centre for Translation
Studies, and are provided for free under a Creative Commons Attribution
license.

For more information, see: http://corpus.leeds.ac.uk/list.html
(12 large file diffs suppressed because they are too large to display)
@@ -1,17 +0,0 @@
This directory contains two wordlists we've put together at Luminoso for our
own purposes. You might find them useful as well.

* `twitter-52M` collects the unigram word frequencies from 52 million tweets.
  The words are not distinguished by language.

* `multi` combines various sources of data in different languages, including:

  * Google Books, for English
  * A smaller corpus of tweets that supposedly come from English speakers
    (there's still a lot of non-English text in there)
  * the Leeds corpora for various languages (see `../leeds/README.txt`)

We would like to release the tools that built `twitter-52M` as soon as they
are less sloppy. `multi` is a dataset that is mainly relevant because it's
the data we happen to already be using, but you might find it useful as well.
@@ -1 +0,0 @@
3f7a03ee49e8f33c2526beb33d61e27968a96b39

@@ -1 +0,0 @@
9b29de132c82bd7287c08c2937e3c4821525e356

@@ -1 +0,0 @@
956c3ff57edf5c45f3e850efd87a30d25c1b4bee

@@ -1 +0,0 @@
4c5a66db8a4190a173814a4d7b31b925c5b131d1

(large file diff suppressed)

@@ -1 +0,0 @@
3710e65f27753facc699fe56269c9631d5ba6aba

(large file diff suppressed)

@@ -1 +0,0 @@
1e9d162c0c1333ce4a9afd79cd8686805f1e19c3

@@ -1 +0,0 @@
b9d52d81bbe078a7de17519ed3494eb4771f0f69

@@ -1 +0,0 @@
f69e13f6be1183f69166fe287ada38354ce4de99