Merge pull request #15 from LuminosoInsight/wordfreq-review

General style fixes and improvements from the code review

Former-commit-id: 8686a47a30
Andrew Lin 2015-07-07 16:53:17 -04:00
commit ad165d2830
5 changed files with 155 additions and 149 deletions

92
scripts/gen_regex.py Normal file
View File

@ -0,0 +1,92 @@
import unicodedata
from ftfy import chardata
import pathlib
from pkg_resources import resource_filename
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def cache_regex_from_func(filename, func):
"""
Generates a regex from a function that accepts a single unicode character,
and caches it in the data path at filename.
"""
with (DATA_PATH / filename).open(mode='w') as file:
file.write(func_to_regex(func))
def _emoji_char_class():
"""
Build a regex for emoji substitution. We create a regex character set
(like "[a-cv-z]") matching characters we consider emoji.
"""
cache_regex_from_func(
'emoji.txt',
lambda c:
chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
c >= '\u2600' and c != '\ufffd'
)
def _non_punct_class():
"""
Builds a regex that matches anything that is not one of the following
classes:
- P: punctuation
- S: symbols
- Z: separators
- C: control characters
This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first.
"""
cache_regex_from_func(
'non_punct.txt',
lambda c: unicodedata.category(c)[0] not in 'PSZC'
)
def _combining_mark_class():
"""
Builds a regex that matches anything that is a combining mark
"""
cache_regex_from_func(
'combining_mark.txt',
lambda c: unicodedata.category(c)[0] == 'M'
)
def func_to_regex(accept):
"""
Converts a function that accepts a single unicode character into a regex.
Unassigned unicode characters are treated like their neighbors.
"""
ranges = []
start = None
has_accepted = False
for x in range(0x110000):
c = chr(x)
if accept(c):
has_accepted = True
if start is None:
start = c
elif unicodedata.category(c) == 'Cn':
if start is None:
start = c
elif start is not None:
if has_accepted:
ranges.append('-'.join([start, chr(x-1)]))
has_accepted = False
start = None
else:
if has_accepted and start is not None:
ranges.append('-'.join([start, chr(x-1)]))
return '[%s]' % ''.join(ranges)
if __name__ == '__main__':
_combining_mark_class()
_non_punct_class()
_emoji_char_class()
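
A simplified, self-contained sketch of the predicate-to-character-class idea behind func_to_regex. The helper name and the letter predicate below are made up for illustration; unlike the real function, this sketch scans only a small slice of codepoints and skips the folding of unassigned (Cn) characters into neighboring ranges.

import re
import unicodedata

def simple_func_to_regex(accept, limit=0x300):
    # Scan a small slice of codepoints (the real script scans all of Unicode)
    # and collect contiguous runs of accepted characters into a character class.
    ranges = []
    start = None
    for x in range(limit):
        if accept(chr(x)):
            if start is None:
                start = x
        elif start is not None:
            ranges.append('%s-%s' % (chr(start), chr(x - 1)))
            start = None
    if start is not None:
        ranges.append('%s-%s' % (chr(start), chr(limit - 1)))
    return '[%s]' % ''.join(ranges)

# Accept Unicode letters; the resulting class matches 'é' but not '3'.
letters = re.compile(simple_func_to_regex(lambda c: unicodedata.category(c).startswith('L')))
assert letters.match('é') and not letters.match('3')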

View File

@ -1,9 +1,10 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq, iter_wordlist,
top_n_list, random_words, random_ascii_words, tokenize
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize,
half_harmonic_mean
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, assert_less, raises
eq_, assert_almost_equal, assert_greater, raises
)
@ -43,10 +44,10 @@ def test_twitter():
word_frequency('rt', lang, 'combined'))
def test_defaults():
def test_minimums():
eq_(word_frequency('esquivalience', 'en'), 0)
eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
eq_(word_frequency('the', 'en', minimum=1), 1)
def test_most_common_words():
# If something causes the most common words in well-supported languages to
@ -96,7 +97,6 @@ def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data, while the fake word "plan't" can't be found.
eq_(tokenize("can't", 'en'), ["can't"])
eq_(tokenize("plan't", 'en'), ["plan't"])
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@ -113,8 +113,13 @@ def test_casefolding():
def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)
assert_less(plant, word_frequency('plan', 'en'))
assert_less(plant, word_frequency('t', 'en'))
assert_almost_equal(
plant,
half_harmonic_mean(
word_frequency('plan', 'en'),
word_frequency('t', 'en')
)
)
def test_not_really_random():
@ -132,3 +137,14 @@ def test_not_really_random():
@raises(ValueError)
def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
eq_(
tokenize('متــــــــعب', 'ar'),
['متعب']
)
eq_(
tokenize('حَرَكَات', 'ar'),
['حركات']
)
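
For readers skimming the new test_phrase_freq assertion: the half-harmonic mean (a * b) / (a + b) of two positive frequencies is always smaller than either input, so checking the exact combined value subsumes the removed assert_less checks. A tiny numeric illustration with made-up frequencies (half_harmonic_mean is re-defined locally so the snippet stands alone):

def half_harmonic_mean(a, b):
    return (a * b) / (a + b)

plan, t = 2e-4, 3e-2                    # hypothetical frequencies for 'plan' and 't'
plant = half_harmonic_mean(plan, t)
assert plant < plan and plant < t       # the estimate never exceeds either part
assert abs(plant - 1.9868e-4) < 1e-7    # (2e-4 * 3e-2) / (2e-4 + 3e-2) ≈ 1.987e-4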

View File

@ -1,12 +1,10 @@
from pkg_resources import resource_filename
from functools import lru_cache
import unicodedata
from ftfy import chardata
import langcodes
import itertools
import msgpack
import re
import gzip
import itertools
import pathlib
import random
import logging
@ -16,125 +14,22 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000
def _emoji_char_class():
def load_range(filename):
"""
Build a regex for emoji substitution. First we create a regex character set
(like "[a-cv-z]") matching characters we consider emoji (see the docstring
of _replace_problem_text()). The final regex matches one such character
followed by any number of spaces and identical characters.
Loads a file from the data path
"""
ranges = []
for i, c in enumerate(chardata.CHAR_CLASS_STRING):
if c == '3' and i >= 0x2600 and i != 0xfffd:
if ranges and i == ranges[-1][1] + 1:
ranges[-1][1] = i
else:
ranges.append([i, i])
return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
EMOJI_RANGE = _emoji_char_class()
def _non_punct_class():
"""
Builds a regex that matches anything that is not one of the following
classes:
- P: punctuation
- S: symbols
- Z: separators
- C: control characters
This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first.
"""
non_punct_file = DATA_PATH / 'non_punct.txt'
try:
with non_punct_file.open() as file:
with (DATA_PATH / filename).open() as file:
return file.read()
except FileNotFoundError:
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
EMOJI_RANGE = load_range('emoji.txt')
NON_PUNCT_RANGE = load_range('non_punct.txt')
COMBINING_MARK_RANGE = load_range('combining_mark.txt')
with non_punct_file.open(mode='w') as file:
file.write(out)
return out
def _combining_mark_class():
"""
Builds a regex that matches anything that is a combining mark
"""
_combining_mark_file = DATA_PATH / 'combining_mark.txt'
try:
with _combining_mark_file.open() as file:
return file.read()
except FileNotFoundError:
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
with _combining_mark_file.open(mode='w') as file:
file.write(out)
return out
def func_to_ranges(accept):
"""
Converts a function that accepts a single unicode character into a list of
ranges. Unassigned unicode characters are automatically accepted.
"""
ranges = []
start = None
for x in range(0x110000):
cat = unicodedata.category(chr(x))
if cat == 'Cn' or accept(chr(x)):
if start is None:
start = x
else:
if start is not None:
ranges.append((start, x-1))
start = None
if start is not None:
ranges.append((start, x))
return ranges
unassigned_ranges = None
def func_to_regex(accept):
"""
Converts a function that accepts a single unicode character into a regex.
Unassigned unicode characters are treated like their neighbors.
"""
ranges = []
start = None
for x in range(0x110000):
cat = unicodedata.category(chr(x))
if cat == 'Cn' or accept(chr(x)):
if start is None:
start = x
else:
if start is not None:
ranges.append((start, x-1))
start = None
if start is not None:
ranges.append((start, x))
global unassigned_ranges
if unassigned_ranges is None:
unassigned_ranges = set(func_to_ranges(lambda _: False))
ranges = [range for range in ranges if range not in unassigned_ranges]
return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in ranges)
COMBINING_MARK_RE = re.compile(_combining_mark_class())
NON_PUNCT_RANGE = _non_punct_class()
COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
def simple_tokenize(text):
"""
A simple tokenizer that can be applied to most languages.
@ -169,12 +64,10 @@ def tokenize(text, lang):
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
elif lang == 'ar':
tokens = simple_tokenize(text)
tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel
tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens]
return [token for token in tokens if token] # remove empty strings
else:
if lang == 'ar':
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
return simple_tokenize(text)
@ -284,7 +177,7 @@ def cB_to_freq(cB):
"""
if cB > 0:
raise ValueError(
"A frequency cannot be a positive number of decibels."
"A frequency cannot be a positive number of centibels."
)
return 10 ** (cB / 100)
@ -298,8 +191,9 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff)
for index, bucket in enumerate(pack):
freq = cB_to_freq(-index)
for word in bucket:
freqs[word] = cB_to_freq(-index)
freqs[word] = freq
return freqs
@ -312,8 +206,7 @@ def iter_wordlist(lang, wordlist='combined'):
with the same rounded frequency, appearing in alphabetical order within
each band.
"""
for sublist in get_frequency_list(lang, wordlist):
yield from sublist
return itertools.chain(*get_frequency_list(lang, wordlist))
def half_harmonic_mean(a, b):
@ -328,25 +221,26 @@ def half_harmonic_mean(a, b):
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.):
def word_frequency(word, lang, wordlist='combined', minimum=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`. The default wordlist is 'combined', built from
whichever of these four sources have sufficient data for the language:
whichever of these five sources have sufficient data for the language:
- Full text of Wikipedia
- A sample of 72 million tweets collected from Twitter in 2014,
divided roughly into languages using automatic language detection
- Frequencies extracted from OpenSubtitles
- The Leeds Internet Corpus
- Google Books Ngrams and Google Books Syntactic Ngrams
Another available wordlist is 'twitter', which uses only the data from
Twitter.
Words that we believe occur at least once per million tokens, based on
the average of these lists, will appear in the word frequency list.
If you look up a word that's not in the list, you'll get the `default`
value, which itself defaults to 0.
The value returned will always be at least as large as `minimum`.
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
@ -357,12 +251,12 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
tokens = tokenize(word, lang)
if len(tokens) == 0:
return default
return minimum
for token in tokens:
if token not in freqs:
# If any word is missing, just return the default value
return default
return minimum
value = freqs[token]
if combined_value is None:
combined_value = value
@ -370,11 +264,16 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = half_harmonic_mean(combined_value, value)
return combined_value
return max(combined_value, minimum)
@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='combined', ascii_only=False):
"""
Return a frequency list of length `n` in descending order of frequency.
This list contains words from `wordlist`, of the given language.
If `ascii_only`, then only ascii words are considered.
"""
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= '~':
@ -384,7 +283,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False):
return results
def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12,
def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
ascii_only=False):
"""
Returns a string of random, space separated words.
@ -410,7 +309,7 @@ def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12,
return ' '.join(selected)
def random_ascii_words(lang='en', wordlist='combined', nwords=4,
def random_ascii_words(lang='en', wordlist='combined', nwords=5,
bits_per_word=12):
"""
Returns a string of random, space separated, ASCII words.

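To summarize the API changes in this file (`default` renamed to `minimum`, centibel decoding, and half-harmonic-mean combination of multi-token phrases), here is a minimal usage sketch. The values in the comments mirror what the updated tests expect; actual numbers depend on the installed wordlists.

from wordfreq import word_frequency, cB_to_freq, half_harmonic_mean

# Centibel buckets decode to plain frequencies: -600 cB -> 10 ** (-6).
assert abs(cB_to_freq(-600) - 1e-6) < 1e-18

# Unknown words now fall back to `minimum` (formerly `default`).
word_frequency('esquivalience', 'en')                   # 0.0
word_frequency('esquivalience', 'en', minimum=1e-6)     # 1e-06

# Multi-token phrases are combined with the half-harmonic mean,
# (a * b) / (a + b), so the estimate never exceeds any single token.
plan = word_frequency('plan', 'en')
t = word_frequency('t', 'en')
assert abs(word_frequency('plan.t', 'en') - half_harmonic_mean(plan, t)) < 1e-9
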
1
wordfreq/data/emoji.txt Normal file
View File

@ -0,0 +1 @@
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■--𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]

View File

@ -14,8 +14,6 @@ def mecab_tokenize(text):
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
parsed_str = MECAB_ANALYZER.parse(text.strip())
lines = [line for line in parsed_str.split('\n')
return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
if line != '' and line != 'EOS']
tokens = [line.split('\t')[0] for line in lines]
return tokens
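
The refactored mecab_tokenize keeps the first tab-separated column of MeCab's analysis table and drops blank lines and the EOS sentinel. The same extraction, shown on a hand-written string shaped like that output (the analysis fields after each tab are invented), for readers without MeCab installed:

# Hand-written stand-in for MECAB_ANALYZER.parse(...) output; only the shape
# (token<TAB>analysis per line, then EOS) matters here.
sample_parse = (
    "これ\t代名詞,*,*,*\n"
    "は\t助詞,係助詞,*,*\n"
    "テスト\t名詞,サ変接続,*,*\n"
    "EOS\n"
)
tokens = [line.split('\t')[0]
          for line in sample_parse.strip().split('\n')
          if line != '' and line != 'EOS']
assert tokens == ['これ', 'は', 'テスト']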