From 54f66d49ee04121da86a2a1aa3400f6bd8cbcdd3 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:13:28 -0400 Subject: [PATCH 01/28] updated tests Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69 --- tests/test.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test.py b/tests/test.py index f02323f..795d533 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,6 +1,7 @@ from wordfreq import ( word_frequency, available_languages, cB_to_freq, iter_wordlist, - top_n_list, random_words, random_ascii_words, tokenize + top_n_list, random_words, random_ascii_words, tokenize, + half_harmonic_mean ) from nose.tools import ( eq_, assert_almost_equal, assert_greater, assert_less, raises @@ -96,7 +97,6 @@ def test_tokenization(): # We preserve apostrophes within words, so "can't" is a single word in the # data, while the fake word "plan't" can't be found. eq_(tokenize("can't", 'en'), ["can't"]) - eq_(tokenize("plan't", 'en'), ["plan't"]) eq_(tokenize('😂test', 'en'), ['😂', 'test']) @@ -113,8 +113,13 @@ def test_casefolding(): def test_phrase_freq(): plant = word_frequency("plan.t", 'en') assert_greater(plant, 0) - assert_less(plant, word_frequency('plan', 'en')) - assert_less(plant, word_frequency('t', 'en')) + assert_almost_equal( + plant, + half_harmonic_mean( + word_frequency('plan', 'en'), + word_frequency('t', 'en') + ) + ) def test_not_really_random(): From 0e610fb601d42ffa36cf9897629b9a81d887590e Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:37:31 -0400 Subject: [PATCH 02/28] factored out emoji regex Former-commit-id: 781a0727135c70bace7b6374a5d92cdd14da5901 --- wordfreq/__init__.py | 116 +++++++--------------------------------- wordfreq/data/emoji.txt | 1 + 2 files changed, 19 insertions(+), 98 deletions(-) create mode 100644 wordfreq/data/emoji.txt diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 8451fd4..f380ede 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -19,20 +19,13 @@ CACHE_SIZE = 100000 def _emoji_char_class(): """ Build a regex for emoji substitution. First we create a regex character set - (like "[a-cv-z]") matching characters we consider emoji (see the docstring - of _replace_problem_text()). The final regex matches one such character - followed by any number of spaces and identical characters. + (like "[a-cv-z]") matching characters we consider emoji. The final regex + matches one such character followed by any number of spaces and identical + characters. """ - ranges = [] - for i, c in enumerate(chardata.CHAR_CLASS_STRING): - if c == '3' and i >= 0x2600 and i != 0xfffd: - if ranges and i == ranges[-1][1] + 1: - ranges[-1][1] = i - else: - ranges.append([i, i]) - return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) - -EMOJI_RANGE = _emoji_char_class() + non_punct_file = DATA_PATH / 'emoji.txt' + with non_punct_file.open() as file: + return file.read() def _non_punct_class(): """ @@ -46,91 +39,20 @@ def _non_punct_class(): want to treat emoji separately should filter them out first. """ non_punct_file = DATA_PATH / 'non_punct.txt' - try: - with non_punct_file.open() as file: - return file.read() - except FileNotFoundError: - - out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') - - with non_punct_file.open(mode='w') as file: - file.write(out) - - return out + with non_punct_file.open() as file: + return file.read() def _combining_mark_class(): """ Builds a regex that matches anything that is a combining mark """ - _combining_mark_file = DATA_PATH / 'combining_mark.txt' - try: - with _combining_mark_file.open() as file: - return file.read() - except FileNotFoundError: - - out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') - - with _combining_mark_file.open(mode='w') as file: - file.write(out) - - return out - - -def func_to_ranges(accept): - """ - Converts a function that accepts a single unicode character into a list of - ranges. Unassigned unicode are automatically accepted. - """ - ranges = [] - start = None - for x in range(0x110000): - cat = unicodedata.category(chr(x)) - if cat == 'Cn' or accept(chr(x)): - if start is None: - start = x - else: - if start is not None: - ranges.append((start, x-1)) - start = None - - if start is not None: - ranges.append((start, x)) - - return ranges - -unassigned_ranges = None - -def func_to_regex(accept): - """ - Converts a function that accepts a single unicode character into a regex. - Unassigned unicode characters are treated like their neighbors. - """ - ranges = [] - start = None - for x in range(0x110000): - cat = unicodedata.category(chr(x)) - if cat == 'Cn' or accept(chr(x)): - if start is None: - start = x - else: - if start is not None: - ranges.append((start, x-1)) - start = None - - if start is not None: - ranges.append((start, x)) - - global unassigned_ranges - if unassigned_ranges is None: - unassigned_ranges = set(func_to_ranges(lambda _: False)) - - ranges = [range for range in ranges if range not in unassigned_ranges] - - return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end)) - for start, end in ranges) - + combining_mark_file = DATA_PATH / 'combining_mark.txt' + with combining_mark_file.open() as file: + return file.read() COMBINING_MARK_RE = re.compile(_combining_mark_class()) + +EMOJI_RANGE = _emoji_char_class() NON_PUNCT_RANGE = _non_punct_class() TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) @@ -169,13 +91,11 @@ def tokenize(text, lang): if mecab_tokenize is None: from wordfreq.mecab import mecab_tokenize return mecab_tokenize(text) - elif lang == 'ar': - tokens = simple_tokenize(text) - tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel - tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens] - return [token for token in tokens if token] # remove empty strings - else: - return simple_tokenize(text) + + if lang == 'ar': + text = COMBINING_MARK_RE.sub('', text.replace('ـ', '')) + + return simple_tokenize(text) def read_cBpack(filename): diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt new file mode 100644 index 0000000..f09f7b9 --- /dev/null +++ b/wordfreq/data/emoji.txt @@ -0,0 +1 @@ +[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿] \ No newline at end of file From 6deced5244324fec3455fad7706357d4bce00eb9 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:38:21 -0400 Subject: [PATCH 03/28] factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 --- scripts/gen_regex.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scripts/gen_regex.py diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py new file mode 100644 index 0000000..7f1ce9f --- /dev/null +++ b/scripts/gen_regex.py @@ -0,0 +1,39 @@ +import argparse +import unicodedata + +def func_to_regex(accept): + """ + Converts a function that accepts a single unicode character into a regex. + Unassigned unicode characters are treated like their neighbors. + """ + ranges = [] + start = None + has_accepted = False + for x in range(0x110000): + c = chr(x) + + if accept(c): + has_accepted = True + if start is None: + start = None + elif unicodedata.category(c) == 'Cn': + if start is None: + start = c + elif start is not None: + if has_accepted: + ranges.append('-'.join([start, chr(x-1)])) + has_accepted = False + start = None + else: + if has_accepted and start is not None: + ranges.append('-'.join([start, chr(x-1)])) + + return '[%s]' % ''.join(ranges) + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Generate a regex matching a function') + parser.add_argument('acceptor', help='an python function that accepts a single char') + args = parser.parse_args() + print(func_to_regex(eval(args.acceptor))) From 693849b02dbf9a83227b6323d7058e2e0e09a77a Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:44:50 -0400 Subject: [PATCH 04/28] imports are already cached Former-commit-id: b1cd2e01d366defcaf047692fc941bb357bead14 --- wordfreq/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index f380ede..9697238 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -73,7 +73,6 @@ def simple_tokenize(text): """ return [token.casefold() for token in TOKEN_RE.findall(text)] -mecab_tokenize = None def tokenize(text, lang): """ Tokenize this text in a way that's straightforward but appropriate for @@ -87,10 +86,7 @@ def tokenize(text, lang): first, so that they can be expected to match the data. """ if lang == 'ja': - global mecab_tokenize - if mecab_tokenize is None: - from wordfreq.mecab import mecab_tokenize - return mecab_tokenize(text) + from wordfreq.mecab import mecab_tokenize if lang == 'ar': text = COMBINING_MARK_RE.sub('', text.replace('ـ', '')) From 20c4930435683ceab429edeaae3928f9d5d5b29e Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:46:42 -0400 Subject: [PATCH 05/28] updated imports Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5 --- scripts/gen_regex.py | 17 +++++++++++++++++ wordfreq/__init__.py | 1 + 2 files changed, 18 insertions(+) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 7f1ce9f..4391f3a 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -1,5 +1,22 @@ import argparse import unicodedata +import chardata + +def _emoji_char_class(): + """ + Build a regex for emoji substitution. First we create a regex character set + (like "[a-cv-z]") matching characters we consider emoji The final regex + matches one such character followed by any number of spaces and identical + characters. + """ + ranges = [] + for i, c in enumerate(chardata.CHAR_CLASS_STRING): + if c == '3' and i >= 0x2600 and i != 0xfffd: + if ranges and i == ranges[-1][1] + 1: + ranges[-1][1] = i + else: + ranges.append([i, i]) + return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) def func_to_regex(accept): """ diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 9697238..a3d0cd0 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -87,6 +87,7 @@ def tokenize(text, lang): """ if lang == 'ja': from wordfreq.mecab import mecab_tokenize + return mecab_tokenize(text) if lang == 'ar': text = COMBINING_MARK_RE.sub('', text.replace('ـ', '')) From 22889de63cefea7f6a423bc8c1698a6e9392dcc2 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:48:11 -0400 Subject: [PATCH 06/28] removed unused imports Former-commit-id: f3f9a654ead1aac26f4a773e9b30fe242218321b --- wordfreq/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index a3d0cd0..b6e7c5b 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -1,9 +1,6 @@ from pkg_resources import resource_filename from functools import lru_cache -import unicodedata -from ftfy import chardata import langcodes -import itertools import msgpack import re import gzip From b81c04a182b83a92b2e22c046fe6565fe7052427 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:50:56 -0400 Subject: [PATCH 07/28] updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 --- scripts/gen_regex.py | 47 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 4391f3a..280489c 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -1,6 +1,9 @@ import argparse import unicodedata import chardata +import pathlib + +DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) def _emoji_char_class(): """ @@ -9,6 +12,8 @@ def _emoji_char_class(): matches one such character followed by any number of spaces and identical characters. """ + emoji_file = DATA_PATH / 'emoji.txt' + ranges = [] for i, c in enumerate(chardata.CHAR_CLASS_STRING): if c == '3' and i >= 0x2600 and i != 0xfffd: @@ -16,7 +21,38 @@ def _emoji_char_class(): ranges[-1][1] = i else: ranges.append([i, i]) - return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) + out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) + + with emoji_file.open(mode='w') as file: + file.write(out) + +def _non_punct_class(): + """ + Builds a regex that matches anything that is not a one of the following + classes: + - P: punctuation + - S: symbols + - Z: separators + - C: control characters + This will classify symbols, including emoji, as punctuation; callers that + want to treat emoji separately should filter them out first. + """ + non_punct_file = DATA_PATH / 'non_punct.txt + + out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') + + with non_punct_file.open(mode='w') as file: + file.write(out) + +def _combining_mark_class(): + """ + Builds a regex that matches anything that is a combining mark + """ + combining_mark_file = DATA_PATH / 'combining_mark.txt' + out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') + + with _combining_mark_file.open(mode='w') as file: + file.write(out) def func_to_regex(accept): """ @@ -48,9 +84,6 @@ def func_to_regex(accept): return '[%s]' % ''.join(ranges) if __name__ == '__main__': - import argparse - - parser = argparse.ArgumentParser(description='Generate a regex matching a function') - parser.add_argument('acceptor', help='an python function that accepts a single char') - args = parser.parse_args() - print(func_to_regex(eval(args.acceptor))) + _combining_mark_class() + _non_punct_class() + _emoji_char_class() From e03de80278104deac7878e3c9b23ea345dc6258c Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:51:46 -0400 Subject: [PATCH 08/28] fixed Error string Former-commit-id: bbdc06452841f9ebf445c5efa93fd873c39e5fe4 --- wordfreq/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index b6e7c5b..8b5ee4d 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -198,7 +198,7 @@ def cB_to_freq(cB): """ if cB > 0: raise ValueError( - "A frequency cannot be a positive number of decibels." + "A frequency cannot be a positive number of centibels." ) return 10 ** (cB / 100) From 512ab7930212c6093ebbd6d99f4fa985bd03375f Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:54:19 -0400 Subject: [PATCH 09/28] use itertools.chain Former-commit-id: 6a40e630608b88f58586982b21ba9d133f6f8348 --- wordfreq/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 8b5ee4d..a65ff96 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -4,6 +4,7 @@ import langcodes import msgpack import re import gzip +import itertools import pathlib import random import logging @@ -226,8 +227,7 @@ def iter_wordlist(lang, wordlist='combined'): with the same rounded frequency, appearing in alphabetical order within each band. """ - for sublist in get_frequency_list(lang, wordlist): - yield from sublist + return itertools.chain(*get_frequency_list(lang, wordlist)) def half_harmonic_mean(a, b): From cec9e23aea34b625980badb7d72c232fd0fb413d Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:55:13 -0400 Subject: [PATCH 10/28] run cB_to_freq only once per bucket Former-commit-id: 5e8ef19321447df7d9b3af5ba7e0764579b33930 --- wordfreq/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index a65ff96..a895fbc 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -213,8 +213,9 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30): freqs = {} pack = get_frequency_list(lang, wordlist, match_cutoff) for index, bucket in enumerate(pack): + freq = cB_to_freq(-index) for word in bucket: - freqs[word] = cB_to_freq(-index) + freqs[word] = freq return freqs From 1f7c53b3dda99a9a7536cce14a1a4b41ac7108b9 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:56:12 -0400 Subject: [PATCH 11/28] updated word_frequency docstring Former-commit-id: 4304a400f78af6bd44508fb25ae9e4af5502e5a4 --- wordfreq/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index a895fbc..800fcee 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -247,13 +247,14 @@ def word_frequency(word, lang, wordlist='combined', default=0.): """ Get the frequency of `word` in the language with code `lang`, from the specified `wordlist`. The default wordlist is 'combined', built from - whichever of these four sources have sufficient data for the language: + whichever of these five sources have sufficient data for the language: - Full text of Wikipedia - A sample of 72 million tweets collected from Twitter in 2014, divided roughly into languages using automatic language detection - Frequencies extracted from OpenSubtitles - The Leeds Internet Corpus + - Google Books Ngrams and Google Books Syntactic Ngrams Another available wordlist is 'twitter', which uses only the data from Twitter. From 7b161644087cba403be1749f1ed706a7940a431d Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:56:40 -0400 Subject: [PATCH 12/28] updated number of words to 5 Former-commit-id: 4b49b1a54790863eed0a977027a01ce6dd59b0f5 --- wordfreq/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 800fcee..574e9e2 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -300,7 +300,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False): return results -def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12, +def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12, ascii_only=False): """ Returns a string of random, space separated words. @@ -326,7 +326,7 @@ def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12, return ' '.join(selected) -def random_ascii_words(lang='en', wordlist='combined', nwords=4, +def random_ascii_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12): """ Returns a string of random, space separated, ASCII words. From d3a6b5413ed16bbec9d84cf6db29844e1c3d1e2c Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:57:45 -0400 Subject: [PATCH 13/28] removed intermediate lists Former-commit-id: 5342ea30335b3cf43a17494892bdbc0b2950341a --- wordfreq/mecab.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py index 85d6060..379255b 100644 --- a/wordfreq/mecab.py +++ b/wordfreq/mecab.py @@ -14,8 +14,6 @@ def mecab_tokenize(text): contains the same table that the command-line version of MeCab would output. We find the tokens in the first column of this table. """ - parsed_str = MECAB_ANALYZER.parse(text.strip()) - lines = [line for line in parsed_str.split('\n') - if line != '' and line != 'EOS'] - tokens = [line.split('\t')[0] for line in lines] - return tokens + return [line.split('\t')[0] + for line in MECAB_ANALYZER.parse(text.strip()).split('\n') + if line != '' and line != 'EOS'] From 589bb624afedb8557b520553f5942979150b623d Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:58:50 -0400 Subject: [PATCH 14/28] updated _emoji_char_class docstring Former-commit-id: 10b5727725faaba26f87445b08731fbe1ec7483e --- scripts/gen_regex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 280489c..5907b09 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -16,6 +16,7 @@ def _emoji_char_class(): ranges = [] for i, c in enumerate(chardata.CHAR_CLASS_STRING): + # c represents the character class (3 corresponds to emoji) if c == '3' and i >= 0x2600 and i != 0xfffd: if ranges and i == ranges[-1][1] + 1: ranges[-1][1] = i From f1e71839ea467555600a90fbd25d1fd3f5509d5b Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 14:59:28 -0400 Subject: [PATCH 15/28] fix grammar Former-commit-id: bd172594d34716ba98f66756f10e0ec9b3b952b9 --- scripts/gen_regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 5907b09..5340933 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -8,7 +8,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) def _emoji_char_class(): """ Build a regex for emoji substitution. First we create a regex character set - (like "[a-cv-z]") matching characters we consider emoji The final regex + (like "[a-cv-z]") matching characters we consider emoji. The final regex matches one such character followed by any number of spaces and identical characters. """ From 16494f18697cae73b41fcf5f2dd31eee9e983ab1 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:01:39 -0400 Subject: [PATCH 16/28] added docstring to top_n_list Former-commit-id: 0b25caaf243a9f92f8106282d5d7d54dd4214d9c --- wordfreq/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 574e9e2..6430ee2 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -291,6 +291,11 @@ def word_frequency(word, lang, wordlist='combined', default=0.): @lru_cache(maxsize=100) def top_n_list(lang, n, wordlist='combined', ascii_only=False): + """ + Return a frequency list of length `n` in descending order of frequency. + This list contains words from `wordlist`, of the given language. + If `ascii_only`, then only ascii words are considered. + """ results = [] for word in iter_wordlist(lang, wordlist): if (not ascii_only) or max(word) <= '~': From d88470df4e786f6fdd1b55569cec913fb0b9bc47 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:03:26 -0400 Subject: [PATCH 17/28] changed default to minimum for word_frequency Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e --- tests/test.py | 2 +- wordfreq/__init__.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test.py b/tests/test.py index 795d533..358abd4 100644 --- a/tests/test.py +++ b/tests/test.py @@ -46,7 +46,7 @@ def test_twitter(): def test_defaults(): eq_(word_frequency('esquivalience', 'en'), 0) - eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6) + eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) def test_most_common_words(): diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 6430ee2..f7a1948 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -243,7 +243,7 @@ def half_harmonic_mean(a, b): @lru_cache(maxsize=CACHE_SIZE) -def word_frequency(word, lang, wordlist='combined', default=0.): +def word_frequency(word, lang, wordlist='combined', minimum=0.): """ Get the frequency of `word` in the language with code `lang`, from the specified `wordlist`. The default wordlist is 'combined', built from @@ -261,7 +261,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.): Words that we believe occur at least once per million tokens, based on the average of these lists, will appear in the word frequency list. - If you look up a word that's not in the list, you'll get the `default` + If you look up a word that's not in the list, you'll get the `minimum` value, which itself defaults to 0. If a word decomposes into multiple tokens, we'll return a smoothed estimate @@ -273,12 +273,12 @@ def word_frequency(word, lang, wordlist='combined', default=0.): tokens = tokenize(word, lang) if len(tokens) == 0: - return default + return minimum for token in tokens: if token not in freqs: # If any word is missing, just return the default value - return default + return minimum value = freqs[token] if combined_value is None: combined_value = value From 53323f8ea7b44b5f8c2a61caf749cac2a589c620 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:10:59 -0400 Subject: [PATCH 18/28] added arabic tests Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819 --- tests/test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test.py b/tests/test.py index 358abd4..afdefae 100644 --- a/tests/test.py +++ b/tests/test.py @@ -137,3 +137,14 @@ def test_not_really_random(): @raises(ValueError) def test_not_enough_ascii(): random_ascii_words(lang='zh') + +def test_ar(): + eq_( + tokenize('متــــــــعب', 'ar'), + ['متعب'] + ) + + eq_( + tokenize('حَرَكَات', 'ar'), + ['حركات'] + ) From aeea503739ef68d20c36c9017e3093f0ee318af9 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:22:04 -0400 Subject: [PATCH 19/28] fixed gen_regex Former-commit-id: 5510fce675c8008ddd28b3070557b5669ab27b5e --- scripts/gen_regex.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 5340933..fb94f17 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -1,7 +1,8 @@ import argparse import unicodedata -import chardata +from ftfy import chardata import pathlib +from pkg_resources import resource_filename DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) @@ -38,7 +39,7 @@ def _non_punct_class(): This will classify symbols, including emoji, as punctuation; callers that want to treat emoji separately should filter them out first. """ - non_punct_file = DATA_PATH / 'non_punct.txt + non_punct_file = DATA_PATH / 'non_punct.txt' out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') @@ -52,7 +53,7 @@ def _combining_mark_class(): combining_mark_file = DATA_PATH / 'combining_mark.txt' out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') - with _combining_mark_file.open(mode='w') as file: + with combining_mark_file.open(mode='w') as file: file.write(out) def func_to_regex(accept): @@ -69,7 +70,7 @@ def func_to_regex(accept): if accept(c): has_accepted = True if start is None: - start = None + start = c elif unicodedata.category(c) == 'Cn': if start is None: start = c From 950e41c8bb6039bd36e7c80fb2dd4b199c3359b5 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:23:15 -0400 Subject: [PATCH 20/28] fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 --- scripts/gen_regex.py | 6 ++++++ wordfreq/__init__.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index fb94f17..1c5f6f5 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -4,8 +4,10 @@ from ftfy import chardata import pathlib from pkg_resources import resource_filename + DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) + def _emoji_char_class(): """ Build a regex for emoji substitution. First we create a regex character set @@ -28,6 +30,7 @@ def _emoji_char_class(): with emoji_file.open(mode='w') as file: file.write(out) + def _non_punct_class(): """ Builds a regex that matches anything that is not a one of the following @@ -46,6 +49,7 @@ def _non_punct_class(): with non_punct_file.open(mode='w') as file: file.write(out) + def _combining_mark_class(): """ Builds a regex that matches anything that is a combining mark @@ -56,6 +60,7 @@ def _combining_mark_class(): with combining_mark_file.open(mode='w') as file: file.write(out) + def func_to_regex(accept): """ Converts a function that accepts a single unicode character into a regex. @@ -85,6 +90,7 @@ def func_to_regex(accept): return '[%s]' % ''.join(ranges) + if __name__ == '__main__': _combining_mark_class() _non_punct_class() diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index f7a1948..9cc1b8d 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -14,6 +14,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) CACHE_SIZE = 100000 + def _emoji_char_class(): """ Build a regex for emoji substitution. First we create a regex character set @@ -25,6 +26,7 @@ def _emoji_char_class(): with non_punct_file.open() as file: return file.read() + def _non_punct_class(): """ Builds a regex that matches anything that is not a one of the following @@ -40,6 +42,7 @@ def _non_punct_class(): with non_punct_file.open() as file: return file.read() + def _combining_mark_class(): """ Builds a regex that matches anything that is a combining mark @@ -55,6 +58,7 @@ NON_PUNCT_RANGE = _non_punct_class() TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) + def simple_tokenize(text): """ A simple tokenizer that can be applied to most languages. @@ -71,6 +75,7 @@ def simple_tokenize(text): """ return [token.casefold() for token in TOKEN_RE.findall(text)] + def tokenize(text, lang): """ Tokenize this text in a way that's straightforward but appropriate for From 5772f1702db49250def1f75ae8556d06fc8fbf49 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:33:36 -0400 Subject: [PATCH 21/28] factored out range loading Former-commit-id: 32803b235b6730352002961fe232068751360add --- wordfreq/__init__.py | 44 ++++++-------------------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 9cc1b8d..1a5f39c 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -14,47 +14,15 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) CACHE_SIZE = 100000 - -def _emoji_char_class(): - """ - Build a regex for emoji substitution. First we create a regex character set - (like "[a-cv-z]") matching characters we consider emoji. The final regex - matches one such character followed by any number of spaces and identical - characters. - """ - non_punct_file = DATA_PATH / 'emoji.txt' - with non_punct_file.open() as file: +def load_range(filename): + with (DATA_PATH / filename).open() as file: return file.read() +EMOJI_RANGE = load_range('emoji.txt') +NON_PUNCT_RANGE = load_range('non_punct.txt') +COMBINING_MARK_RANGE = load_range('combining_mark.txt') -def _non_punct_class(): - """ - Builds a regex that matches anything that is not a one of the following - classes: - - P: punctuation - - S: symbols - - Z: separators - - C: control characters - This will classify symbols, including emoji, as punctuation; callers that - want to treat emoji separately should filter them out first. - """ - non_punct_file = DATA_PATH / 'non_punct.txt' - with non_punct_file.open() as file: - return file.read() - - -def _combining_mark_class(): - """ - Builds a regex that matches anything that is a combining mark - """ - combining_mark_file = DATA_PATH / 'combining_mark.txt' - with combining_mark_file.open() as file: - return file.read() - -COMBINING_MARK_RE = re.compile(_combining_mark_class()) - -EMOJI_RANGE = _emoji_char_class() -NON_PUNCT_RANGE = _non_punct_class() +COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE) TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) From 0589bed362590bfdecff4d261279c18f3c189acf Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:33:51 -0400 Subject: [PATCH 22/28] updated docstring Former-commit-id: 9b851f3afe91177b2853c3498ff0d6b0eb7c42f8 --- scripts/gen_regex.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 1c5f6f5..ea50186 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -10,10 +10,8 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) def _emoji_char_class(): """ - Build a regex for emoji substitution. First we create a regex character set - (like "[a-cv-z]") matching characters we consider emoji. The final regex - matches one such character followed by any number of spaces and identical - characters. + Build a regex for emoji substitution. We create a regex character set + (like "[a-cv-z]") matching characters we consider emoji. """ emoji_file = DATA_PATH / 'emoji.txt' From a5dc6eb5fce48f71020948585ef9207773afd494 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:43:34 -0400 Subject: [PATCH 23/28] updated emoji parser Former-commit-id: f04ca8fc9e40b5d8bfb1563414fc4a15a8c8edb0 --- scripts/gen_regex.py | 17 ++++++----------- wordfreq/data/emoji.txt | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index ea50186..1a32ac7 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -15,18 +15,13 @@ def _emoji_char_class(): """ emoji_file = DATA_PATH / 'emoji.txt' - ranges = [] - for i, c in enumerate(chardata.CHAR_CLASS_STRING): - # c represents the character class (3 corresponds to emoji) - if c == '3' and i >= 0x2600 and i != 0xfffd: - if ranges and i == ranges[-1][1] + 1: - ranges[-1][1] = i - else: - ranges.append([i, i]) - out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) + def accept(c): + x = ord(c) + return chardata.CHAR_CLASS_STRING[x] == '3' and \ + x >= 0x2600 and x != 0xfffd - with emoji_file.open(mode='w') as file: - file.write(out) + with (DATA_PATH / 'emoji.txt').open(mode='w') as file: + file.write(func_to_regex(accept)) def _non_punct_class(): diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt index f09f7b9..15c56fb 100644 --- a/wordfreq/data/emoji.txt +++ b/wordfreq/data/emoji.txt @@ -1 +1 @@ -[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿] \ No newline at end of file +[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■-￸-𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿] \ No newline at end of file From 927aaae920426fed6bdd6b38bbe21444edca896b Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:46:33 -0400 Subject: [PATCH 24/28] updated minimum Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb --- tests/test.py | 4 ++-- wordfreq/__init__.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test.py b/tests/test.py index afdefae..0a45450 100644 --- a/tests/test.py +++ b/tests/test.py @@ -44,10 +44,10 @@ def test_twitter(): word_frequency('rt', lang, 'combined')) -def test_defaults(): +def test_minimums(): eq_(word_frequency('esquivalience', 'en'), 0) eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) - + eq_(word_frequency('the', 'en', minimum=1), 1) def test_most_common_words(): # If something causes the most common words in well-supported languages to diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 1a5f39c..5f2896a 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -234,8 +234,8 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.): Words that we believe occur at least once per million tokens, based on the average of these lists, will appear in the word frequency list. - If you look up a word that's not in the list, you'll get the `minimum` - value, which itself defaults to 0. + + The value returned will always be at least as large as `minimum`. If a word decomposes into multiple tokens, we'll return a smoothed estimate of the word frequency that is no greater than the frequency of any of its @@ -259,7 +259,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.): # Combine word values using the half-harmonic-mean formula, # (a * b) / (a + b). This operation is associative. combined_value = half_harmonic_mean(combined_value, value) - return combined_value + return max(combined_value, minimum) @lru_cache(maxsize=100) From 993bc4da15a007e988f79df8ae21e24175cd6475 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:47:37 -0400 Subject: [PATCH 25/28] revert to using global mecab_tokenize variable Former-commit-id: 189a5b9cd6bd36857c73ffd0bef86e38bc40da16 --- wordfreq/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 5f2896a..bf68f14 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -43,7 +43,7 @@ def simple_tokenize(text): """ return [token.casefold() for token in TOKEN_RE.findall(text)] - +mecab_tokenize = None def tokenize(text, lang): """ Tokenize this text in a way that's straightforward but appropriate for @@ -57,7 +57,9 @@ def tokenize(text, lang): first, so that they can be expected to match the data. """ if lang == 'ja': - from wordfreq.mecab import mecab_tokenize + global mecab_tokenize + if mecab_tokenize is None: + from wordfreq.mecab import mecab_tokenize return mecab_tokenize(text) if lang == 'ar': From 4d3123e2eedf465b5f47ad9cd7afdeecc236319a Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 16:00:24 -0400 Subject: [PATCH 26/28] cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 --- scripts/gen_regex.py | 45 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 1a32ac7..9801a73 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -8,25 +8,31 @@ from pkg_resources import resource_filename DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) +def cache_regex_from_func(filename, func): + """ + Generates a regex from a function that accepts a single unicode character, + and caches it in the data path at filename. + """ + with (DATA_PATH / filename).open(mode='w') as file: + file.write(func_to_regex(func)) + + def _emoji_char_class(): """ Build a regex for emoji substitution. We create a regex character set (like "[a-cv-z]") matching characters we consider emoji. """ - emoji_file = DATA_PATH / 'emoji.txt' - - def accept(c): - x = ord(c) - return chardata.CHAR_CLASS_STRING[x] == '3' and \ - x >= 0x2600 and x != 0xfffd - - with (DATA_PATH / 'emoji.txt').open(mode='w') as file: - file.write(func_to_regex(accept)) + cache_regex_from_func( + 'emoji.txt', + lambda c: + chardata.CHAR_CLASS_STRING[ord(c)] == '3' and + c >= '\u2600' and c != '\ufffd' + ) def _non_punct_class(): """ - Builds a regex that matches anything that is not a one of the following + Builds a regex that matches anything that is not one of the following classes: - P: punctuation - S: symbols @@ -35,23 +41,20 @@ def _non_punct_class(): This will classify symbols, including emoji, as punctuation; callers that want to treat emoji separately should filter them out first. """ - non_punct_file = DATA_PATH / 'non_punct.txt' - - out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') - - with non_punct_file.open(mode='w') as file: - file.write(out) + cache_regex_from_func( + 'non_punct.txt', + lambda c: unicodedata.category(c)[0] not in 'PSZC' + ) def _combining_mark_class(): """ Builds a regex that matches anything that is a combining mark """ - combining_mark_file = DATA_PATH / 'combining_mark.txt' - out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') - - with combining_mark_file.open(mode='w') as file: - file.write(out) + cache_regex_from_func( + 'combining_mark.txt', + lambda c: unicodedata.category(c)[0] == 'M' + ) def func_to_regex(accept): From 8eac6bf0062c1255d32fa6111adbd4c7a457556c Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 16:00:37 -0400 Subject: [PATCH 27/28] added documentation to load ranges Former-commit-id: af362480d5bdb7395dd482a39898013fb723dd27 --- wordfreq/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index bf68f14..afb1971 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -15,6 +15,9 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) CACHE_SIZE = 100000 def load_range(filename): + """ + Loads a file from the data path + """ with (DATA_PATH / filename).open() as file: return file.read() From b145e02ce4e98b5d3b7076993dae29c57b4c5768 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 16:21:22 -0400 Subject: [PATCH 28/28] removed unused imports Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179 --- scripts/gen_regex.py | 1 - tests/test.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 9801a73..38d4c39 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -1,4 +1,3 @@ -import argparse import unicodedata from ftfy import chardata import pathlib diff --git a/tests/test.py b/tests/test.py index 0a45450..ba52fb8 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,10 +1,10 @@ from wordfreq import ( - word_frequency, available_languages, cB_to_freq, iter_wordlist, + word_frequency, available_languages, cB_to_freq, top_n_list, random_words, random_ascii_words, tokenize, half_harmonic_mean ) from nose.tools import ( - eq_, assert_almost_equal, assert_greater, assert_less, raises + eq_, assert_almost_equal, assert_greater, raises )