add new data files from wordfreq_builder

Former-commit-id: 35aec061de
Robyn Speer 2015-05-11 18:45:47 -04:00
parent f92598b13d
commit aa0e844b81
45 changed files with 204 additions and 639859 deletions

MANIFEST.in Normal file
View File

@@ -0,0 +1,2 @@
recursive-include wordfreq/data *.gz
include README.md

View File

@@ -33,5 +33,6 @@ setup(
classifiers = classifiers,
long_description = "\n".join(doclines[2:]),
packages=['wordfreq'],
include_package_data=True,
install_requires=['ftfy >= 4', 'msgpack-python'],
)

View File

@@ -1,16 +1,208 @@
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
import msgpack
import re
import gzip
import pathlib
import random
import logging
logger = logging.getLogger(__name__)
NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff０-９Ａ-Ｚａ-ｚ\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000
def tokenize(text):
"""
A simple tokenizer that can be applied to most languages. Strings that
are looked up in wordfreq will be run through this tokenizer first,
so that they can be expected to match the data.
"""
return [token.lower() for token in TOKEN_RE.findall(text)]
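# A sketch of the expected behavior: tokenize("Hello, world!") gives
# ['hello', 'world'], and the apostrophe rule keeps contractions together,
# so tokenize("can't stop") gives ["can't", 'stop'].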
def read_dBpack(filename):
"""
Read a file from an idiosyncratic format that we use for storing
approximate word frequencies, called "dBpack".
The dBpack format is as follows:
- The file on disk is a gzipped file in msgpack format, which decodes to a
list of lists of words.
- Each inner list of words corresponds to a particular word frequency,
rounded to the nearest decibel. 0 dB represents a word that occurs with
probability 1, so it is the only word in the data (this of course doesn't
happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB
represents a word that occurs once per 1000 tokens, and so on.
- The index of each list within the overall list is the negative of its
frequency in decibels.
- Each inner list is sorted in alphabetical order.
As an example, consider a corpus consisting only of the words "red fish
blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red"
and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word
frequencies would decode to this list:
[[], [], [], ['fish'], [], [], ['blue', 'red']]
"""
with gzip.open(filename, 'rb') as infile:
got = msgpack.load(infile, encoding='utf-8')
return got
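# A minimal sketch of writing a file in this format, for illustration only;
# write_dBpack is hypothetical and not part of wordfreq:
#
#     def write_dBpack(buckets, filename):
#         with gzip.open(filename, 'wb') as outfile:
#             msgpack.dump(buckets, outfile)
#
#     write_dBpack([[], [], [], ['fish'], [], [], ['blue', 'red']],
#                  'red-fish-blue-fish.msgpack.gz')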
def available_languages(wordlist='combined'):
"""
List the languages (as language-code strings) that the wordlist of a given
name is available in.
"""
available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = path
return available
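# For example, if the installed data files include combined_en.msgpack.gz and
# combined_pt.msgpack.gz (an assumption about what is present),
# available_languages() would return roughly
# {'en': DATA_PATH / 'combined_en.msgpack.gz',
#  'pt': DATA_PATH / 'combined_pt.msgpack.gz'}.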
@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='combined', match_cutoff=50):
"""
Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_dBpack` for what this represents.)
Because we use the `langcodes` module, we can handle slight
variations in language codes. For example, looking for 'pt-BR',
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
Looking up the alternate code 'por' will also get the same list.
"""
available = available_languages(wordlist)
best, score = langcodes.best_match(lang, list(available),
min_score=match_cutoff)
if score == 0:
raise LookupError("No wordlist available for language %r" % lang)
# Convert the LanguageData object to a normalized language code
got = str(best)
if got != lang:
logger.warning(
"You asked for word frequencies in language %r. Using the "
"nearest match, which is %r (%s)."
% (lang, got, best.language_name('en'))
)
filepath = available[str(best)]
return read_dBpack(str(filepath))
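# A usage sketch, assuming the 'pt' combined wordlist is among the installed
# data files: get_frequency_list('pt_BR') and get_frequency_list('por') should
# both resolve to the same 'pt' list, logging a warning that the nearest match
# was used.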
def dB_to_freq(dB):
if dB > 0:
raise ValueError(
"A frequency cannot be a positive number of decibels."
)
return 10 ** (dB / 10)
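# Worked values: dB_to_freq(0) == 1.0, dB_to_freq(-20) == 0.01 (once per 100
# tokens), and dB_to_freq(-30) == 0.001 (once per 1000 tokens), matching the
# scale described in read_dBpack's docstring.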
@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='combined', match_cutoff=50):
"""
Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities.
"""
freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff)
for index, bucket in enumerate(pack):
for word in bucket:
freqs[word] = dB_to_freq(-index)
return freqs
def iter_wordlist(lang, wordlist='combined'):
"""
Yield the words in a wordlist in approximate descending order of
frequency.
Because wordfreq rounds off its frequencies, the words will form 'bands'
with the same rounded frequency, appearing in alphabetical order within
each band.
"""
for sublist in get_frequency_list(lang, wordlist):
for word in sublist:
yield word
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`. The default (and currently only) wordlist is
'combined', built from whichever of these four sources have sufficient
data for the language:
- Full text of Wikipedia
- A sample of 72 million tweets collected from Twitter in 2014,
divided roughly into languages using automatic language detection
- Frequencies extracted from OpenSubtitles
- The Leeds Internet Corpus
Words that we believe occur at least once per million tokens, based on
the average of these lists, will appear in the word frequency list.
If you look up a word that's not in the list, you'll get the `default`
value, which itself defaults to 0.
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
individual tokens.
"""
freqs = get_frequency_dict(lang, wordlist)
combined_value = None
for token in tokenize(word):
if token not in freqs:
# If any word is missing, just return the default value
return default
value = freqs[token]
if combined_value is None:
combined_value = value
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = (combined_value * value) / (combined_value + value)
return combined_value
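# A worked example with made-up frequencies: if the tokens 'red' and 'fish'
# each had frequency 1e-3, word_frequency('red fish', 'en') would combine them
# as (1e-3 * 1e-3) / (1e-3 + 1e-3) == 5e-4, half the frequency of either token
# and never more than the rarer one.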
@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
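# Return the `n` most frequent words in the given language's wordlist, in
# approximately descending frequency order. When `ascii_only` is set, a word
# is kept only if its greatest character is at most '~' (the last printable
# ASCII character), i.e. the whole word is plain ASCII.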
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= '~':
results.append(word)
if len(results) >= n:
break
return results
def random_words(nwords=4, lang='en', wordlist='combined', bits_per_word=12,
ascii_only=False):
n_choices = 2 ** bits_per_word
choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
if len(choices) < n_choices:
raise ValueError(
"There aren't enough words in the wordlist to provide %d bits of "
"entropy per word." % bits_per_word
)
selected = [random.choice(choices) for i in range(nwords)]
return ' '.join(selected)
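# A sketch of the arithmetic behind the defaults: bits_per_word=12 draws each
# word uniformly from the top 2 ** 12 == 4096 words, so nwords=4 gives a
# passphrase with about 4 * 12 == 48 bits of entropy, in the spirit of
# https://xkcd.com/936/.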
def random_ascii_words(nwords=4, lang='en', wordlist='combined',
bits_per_word=12):
return random_words(nwords, lang, wordlist, bits_per_word, ascii_only=True)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,128 +0,0 @@
from wordfreq.config import DB_FILENAME, CACHE_SIZE
from functools import lru_cache
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='multi', offset=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`.
The offset gets added to all values, to monotonically account for the
fact that we have not observed all possible words.
"""
c = CONN.cursor()
c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
(standardize_word(word), lang, wordlist))
row = c.fetchone()
if row is None:
return offset
else:
return row[0] + offset
def wordlist_size(wordlist, lang=None):
"""
Get the number of words in a wordlist.
"""
c = CONN.cursor()
if lang is None:
c.execute(
"SELECT count(*) from words where wordlist=?",
(wordlist,)
)
else:
c.execute(
"SELECT count(*) from words where wordlist=? and lang=?",
(wordlist, lang)
)
return c.fetchone()[0]
def average_frequency(wordlist, lang):
"""
A kind of slow function to get the average frequency for words in a
wordlist.
If, for example, you're smoothing over word frequencies by adding the
same baseline number to all of them, this can tell you what a good
baseline is. (For multi/en, it's 6.7e-07.)
"""
c = CONN.cursor()
c.execute("SELECT avg(freq) from words where wordlist=? and lang=?",
(wordlist, lang))
return c.fetchone()[0]
def iter_wordlist(wordlist='multi', lang=None):
"""
Returns a generator, yielding (word, lang, frequency) triples from
a wordlist in descending order of frequency.
If a `lang` is specified, the results will only contain words in that
language.
"""
c = CONN.cursor()
if lang is None:
results = c.execute(
"SELECT word, lang, freq from words where wordlist=? "
"ORDER BY freq desc",
(wordlist,)
)
else:
results = c.execute(
"SELECT word, lang, freq from words where "
"wordlist=? and lang=? ORDER BY freq DESC",
(wordlist, lang)
)
return results
def wordlist_info(connection=None):
"""
Get info about all the wordlists in a database, returning their
list name, language, and number of words as 'wordlist', 'lang',
and 'count' respectively.
The database connection can be given as an argument, in order to get
information about a database other than the default configured one.
"""
if connection is None:
connection = CONN
c = connection.cursor()
results = c.execute(
"SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
)
for wordlist, lang, count in results:
yield {'wordlist': wordlist, 'lang': lang, 'count': count}
def random_words(nwords=4, bits_per_word=12, wordlist='google-books',
lang='en'):
"""
There are a few reasons you might want to see a sample of words in a
wordlist:
- Generating test cases
- Getting a feel for what a wordlist contains
- Generating passwords as in https://xkcd.com/936/
Parameters:
- `nwords` is the number of words to select.
- `bits_per_word` indicates how many bits of randomness per word you want,
up to log2(wordlist_size). As you increase it, the words get more obscure.
- `wordlist` and `lang` specify the wordlist to use.
"""
import random
limit = 2 ** bits_per_word
c = CONN.cursor()
results = c.execute(
"SELECT word from words where wordlist = ? and lang = ? "
"ORDER BY freq DESC LIMIT ?",
(wordlist, lang, limit)
)
words = [row[0] for row in results]
selected = random.sample(words, nwords)
return u' '.join(selected)

View File

@@ -1,10 +0,0 @@
This data was compiled from the Google Books Ngram Viewer data, particularly
the 2012 English dataset.
The data is available from https://books.google.com/ngrams. The terms of use of
this data are:
"Ngram Viewer graphs and data may be freely used for any purpose, although
acknowledgement of Google Books Ngram Viewer as the source, and inclusion of a
link to http://books.google.com/ngrams, would be appreciated."

View File

@@ -1 +0,0 @@
48b238cc5b3d359d0e8ac48f6321aca27c1ec098

View File

@@ -1,5 +0,0 @@
These wordlists come from the University of Leeds Centre for Translation
Studies, and are provided for free under a Creative Commons Attribution
license.
For more information, see: http://corpus.leeds.ac.uk/list.html

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +0,0 @@
This directory contains two wordlists we've put together at Luminoso for our
own purposes. You might find them useful as well.
* `twitter-52M` collects the unigram word frequencies from 52 million tweets.
The words are not distinguished by language.
* `multi` combines various sources of data in different languages, including:
* Google Books, for English
* A smaller corpus of tweets that supposedly come from English speakers
(there's still a lot of non-English text in there)
* the Leeds corpora for various languages (see `../leeds/README.txt`)
We would like to release the tools that built `twitter-52M` as soon as they are
less sloppy. `multi` is a dataset that is mainly relevant because it's the data
we happen to already be using, but you might find it useful as well.

View File

@@ -1 +0,0 @@
3f7a03ee49e8f33c2526beb33d61e27968a96b39

View File

@@ -1 +0,0 @@
9b29de132c82bd7287c08c2937e3c4821525e356

View File

@@ -1 +0,0 @@
956c3ff57edf5c45f3e850efd87a30d25c1b4bee

View File

@@ -1 +0,0 @@
4c5a66db8a4190a173814a4d7b31b925c5b131d1

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
3710e65f27753facc699fe56269c9631d5ba6aba

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
1e9d162c0c1333ce4a9afd79cd8686805f1e19c3

View File

@@ -1 +0,0 @@
b9d52d81bbe078a7de17519ed3494eb4771f0f69

View File

@@ -1 +0,0 @@
f69e13f6be1183f69166fe287ada38354ce4de99