From 54f66d49ee04121da86a2a1aa3400f6bd8cbcdd3 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:13:28 -0400
Subject: [PATCH 01/28] updated tests

Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69
---
 tests/test.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index f02323f..795d533 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,7 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words, tokenize
+    top_n_list, random_words, random_ascii_words, tokenize,
+    half_harmonic_mean
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -96,7 +97,6 @@ def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
     eq_(tokenize("can't", 'en'), ["can't"])
-    eq_(tokenize("plan't", 'en'), ["plan't"])
 
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
@@ -113,8 +113,13 @@ def test_casefolding():
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
-    assert_less(plant, word_frequency('plan', 'en'))
-    assert_less(plant, word_frequency('t', 'en'))
+    assert_almost_equal(
+        plant,
+        half_harmonic_mean(
+            word_frequency('plan', 'en'),
+            word_frequency('t', 'en')
+            )
+        )
 
 
 def test_not_really_random():

From 0e610fb601d42ffa36cf9897629b9a81d887590e Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:37:31 -0400
Subject: [PATCH 02/28] factored out emoji regex

Former-commit-id: 781a0727135c70bace7b6374a5d92cdd14da5901
---
 wordfreq/__init__.py    | 116 +++++++---------------------------------
 wordfreq/data/emoji.txt |   1 +
 2 files changed, 19 insertions(+), 98 deletions(-)
 create mode 100644 wordfreq/data/emoji.txt

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 8451fd4..f380ede 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -19,20 +19,13 @@ CACHE_SIZE = 100000
 def _emoji_char_class():
     """
     Build a regex for emoji substitution.  First we create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji (see the docstring
-    of _replace_problem_text()).  The final regex matches one such character
-    followed by any number of spaces and identical characters.
+    (like "[a-cv-z]") matching characters we consider emoji. The final regex
+    matches one such character followed by any number of spaces and identical
+    characters.
     """
-    ranges = []
-    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
-        if c == '3' and i >= 0x2600 and i != 0xfffd:
-            if ranges and i == ranges[-1][1] + 1:
-                ranges[-1][1] = i
-            else:
-                ranges.append([i, i])
-    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
-
-EMOJI_RANGE = _emoji_char_class()
+    non_punct_file = DATA_PATH / 'emoji.txt'
+    with non_punct_file.open() as file:
+        return file.read()
 
 def _non_punct_class():
     """
@@ -46,91 +39,20 @@ def _non_punct_class():
     want to treat emoji separately should filter them out first.
     """
     non_punct_file = DATA_PATH / 'non_punct.txt'
-    try:
-        with non_punct_file.open() as file:
-            return file.read()
-    except FileNotFoundError:
-
-        out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
-
-        with non_punct_file.open(mode='w') as file:
-            file.write(out)
-
-        return out
+    with non_punct_file.open() as file:
+        return file.read()
 
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
     """
-    _combining_mark_file = DATA_PATH / 'combining_mark.txt'
-    try:
-        with _combining_mark_file.open() as file:
-            return file.read()
-    except FileNotFoundError:
-
-        out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
-
-        with _combining_mark_file.open(mode='w') as file:
-            file.write(out)
-
-        return out
-
-
-def func_to_ranges(accept):
-    """
-    Converts a function that accepts a single unicode character into a list of
-    ranges. Unassigned unicode are automatically accepted.
-    """
-    ranges = []
-    start = None
-    for x in range(0x110000):
-        cat = unicodedata.category(chr(x))
-        if cat == 'Cn' or accept(chr(x)):
-            if start is None:
-                start = x
-        else:
-            if start is not None:
-                ranges.append((start, x-1))
-                start = None
-
-    if start is not None:
-        ranges.append((start, x))
-
-    return ranges
-
-unassigned_ranges = None
-
-def func_to_regex(accept):
-    """
-    Converts a function that accepts a single unicode character into a regex.
-    Unassigned unicode characters are treated like their neighbors.
-    """
-    ranges = []
-    start = None
-    for x in range(0x110000):
-        cat = unicodedata.category(chr(x))
-        if cat == 'Cn' or accept(chr(x)):
-            if start is None:
-                start = x
-        else:
-            if start is not None:
-                ranges.append((start, x-1))
-                start = None
-
-    if start is not None:
-        ranges.append((start, x))
-
-    global unassigned_ranges
-    if unassigned_ranges is None:
-        unassigned_ranges = set(func_to_ranges(lambda _: False))
-
-    ranges = [range for range in ranges if range not in unassigned_ranges]
-
-    return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
-                                for start, end in ranges)
-
+    combining_mark_file = DATA_PATH / 'combining_mark.txt'
+    with combining_mark_file.open() as file:
+        return file.read()
 
 COMBINING_MARK_RE = re.compile(_combining_mark_class())
+
+EMOJI_RANGE = _emoji_char_class()
 NON_PUNCT_RANGE = _non_punct_class()
 
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
@@ -169,13 +91,11 @@ def tokenize(text, lang):
         if mecab_tokenize is None:
             from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
-    elif lang == 'ar':
-        tokens = simple_tokenize(text)
-        tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel
-        tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens]
-        return [token for token in tokens if token] # remove empty strings
-    else:
-        return simple_tokenize(text)
+
+    if lang == 'ar':
+        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+
+    return simple_tokenize(text)
 
 
 def read_cBpack(filename):
diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt
new file mode 100644
index 0000000..f09f7b9
--- /dev/null
+++ b/wordfreq/data/emoji.txt
@@ -0,0 +1 @@
+[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮￼-￼𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿]
\ No newline at end of file

From 6deced5244324fec3455fad7706357d4bce00eb9 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:38:21 -0400
Subject: [PATCH 03/28] factored out regex generation

Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0
---
 scripts/gen_regex.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 scripts/gen_regex.py

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
new file mode 100644
index 0000000..7f1ce9f
--- /dev/null
+++ b/scripts/gen_regex.py
@@ -0,0 +1,39 @@
+import argparse
+import unicodedata
+
+def func_to_regex(accept):
+    """
+    Converts a function that accepts a single unicode character into a regex.
+    Unassigned unicode characters are treated like their neighbors.
+    """
+    ranges = []
+    start = None
+    has_accepted = False
+    for x in range(0x110000):
+        c = chr(x)
+
+        if accept(c):
+            has_accepted = True
+            if start is None:
+                start = None
+        elif unicodedata.category(c) == 'Cn':
+            if start is None:
+                start = c
+        elif start is not None:
+            if has_accepted:
+                ranges.append('-'.join([start, chr(x-1)]))
+                has_accepted = False
+            start = None
+    else:
+        if has_accepted and start is not None:
+            ranges.append('-'.join([start, chr(x-1)]))
+
+    return '[%s]' % ''.join(ranges)
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
+    parser.add_argument('acceptor', help='an python function that accepts a single char')
+    args = parser.parse_args()
+    print(func_to_regex(eval(args.acceptor)))

From 693849b02dbf9a83227b6323d7058e2e0e09a77a Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:44:50 -0400
Subject: [PATCH 04/28] imports are already cached

Former-commit-id: b1cd2e01d366defcaf047692fc941bb357bead14
---
 wordfreq/__init__.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f380ede..9697238 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -73,7 +73,6 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
-mecab_tokenize = None
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for
@@ -87,10 +86,7 @@ def tokenize(text, lang):
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
+        from wordfreq.mecab import mecab_tokenize
 
     if lang == 'ar':
         text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))

From 20c4930435683ceab429edeaae3928f9d5d5b29e Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:46:42 -0400
Subject: [PATCH 05/28] updated imports

Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5
---
 scripts/gen_regex.py | 17 +++++++++++++++++
 wordfreq/__init__.py |  1 +
 2 files changed, 18 insertions(+)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 7f1ce9f..4391f3a 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,5 +1,22 @@
 import argparse
 import unicodedata
+import chardata
+
+def _emoji_char_class():
+    """
+    Build a regex for emoji substitution.  First we create a regex character set
+    (like "[a-cv-z]") matching characters we consider emoji The final regex
+    matches one such character followed by any number of spaces and identical
+    characters.
+    """
+    ranges = []
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
 
 def func_to_regex(accept):
     """
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 9697238..a3d0cd0 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -87,6 +87,7 @@ def tokenize(text, lang):
     """
     if lang == 'ja':
         from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
 
     if lang == 'ar':
         text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))

From 22889de63cefea7f6a423bc8c1698a6e9392dcc2 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:48:11 -0400
Subject: [PATCH 06/28] removed unused imports

Former-commit-id: f3f9a654ead1aac26f4a773e9b30fe242218321b
---
 wordfreq/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index a3d0cd0..b6e7c5b 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -1,9 +1,6 @@
 from pkg_resources import resource_filename
 from functools import lru_cache
-import unicodedata
-from ftfy import chardata
 import langcodes
-import itertools
 import msgpack
 import re
 import gzip

From b81c04a182b83a92b2e22c046fe6565fe7052427 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:50:56 -0400
Subject: [PATCH 07/28] updated gen_regex to be run as script

Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971
---
 scripts/gen_regex.py | 47 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 4391f3a..280489c 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,6 +1,9 @@
 import argparse
 import unicodedata
 import chardata
+import pathlib
+
+DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 def _emoji_char_class():
     """
@@ -9,6 +12,8 @@ def _emoji_char_class():
     matches one such character followed by any number of spaces and identical
     characters.
     """
+    emoji_file = DATA_PATH / 'emoji.txt'
+
     ranges = []
     for i, c in enumerate(chardata.CHAR_CLASS_STRING):
         if c == '3' and i >= 0x2600 and i != 0xfffd:
@@ -16,7 +21,38 @@ def _emoji_char_class():
                 ranges[-1][1] = i
             else:
                 ranges.append([i, i])
-    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+
+    with emoji_file.open(mode='w') as file:
+        file.write(out)
+
+def _non_punct_class():
+    """
+    Builds a regex that matches anything that is not a one of the following
+    classes:
+    - P: punctuation
+    - S: symbols
+    - Z: separators
+    - C: control characters
+    This will classify symbols, including emoji, as punctuation; callers that
+    want to treat emoji separately should filter them out first.
+    """
+    non_punct_file = DATA_PATH / 'non_punct.txt
+
+    out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
+
+    with non_punct_file.open(mode='w') as file:
+        file.write(out)
+
+def _combining_mark_class():
+    """
+    Builds a regex that matches anything that is a combining mark
+    """
+    combining_mark_file = DATA_PATH / 'combining_mark.txt'
+    out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
+
+    with _combining_mark_file.open(mode='w') as file:
+        file.write(out)
 
 def func_to_regex(accept):
     """
@@ -48,9 +84,6 @@ def func_to_regex(accept):
     return '[%s]' % ''.join(ranges)
 
 if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
-    parser.add_argument('acceptor', help='an python function that accepts a single char')
-    args = parser.parse_args()
-    print(func_to_regex(eval(args.acceptor)))
+    _combining_mark_class()
+    _non_punct_class()
+    _emoji_char_class()

From e03de80278104deac7878e3c9b23ea345dc6258c Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:51:46 -0400
Subject: [PATCH 08/28] fixed unit name in cB_to_freq error string (centibels, not decibels)

Former-commit-id: bbdc06452841f9ebf445c5efa93fd873c39e5fe4
---
 wordfreq/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index b6e7c5b..8b5ee4d 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -198,7 +198,7 @@ def cB_to_freq(cB):
     """
     if cB > 0:
         raise ValueError(
-            "A frequency cannot be a positive number of decibels."
+            "A frequency cannot be a positive number of centibels."
         )
     return 10 ** (cB / 100)
 

From 512ab7930212c6093ebbd6d99f4fa985bd03375f Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:54:19 -0400
Subject: [PATCH 09/28] use itertools.chain

Former-commit-id: 6a40e630608b88f58586982b21ba9d133f6f8348
---
 wordfreq/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 8b5ee4d..a65ff96 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -4,6 +4,7 @@ import langcodes
 import msgpack
 import re
 import gzip
+import itertools
 import pathlib
 import random
 import logging
@@ -226,8 +227,7 @@ def iter_wordlist(lang, wordlist='combined'):
     with the same rounded frequency, appearing in alphabetical order within
     each band.
     """
-    for sublist in get_frequency_list(lang, wordlist):
-        yield from sublist
+    return itertools.chain(*get_frequency_list(lang, wordlist))
 
 
 def half_harmonic_mean(a, b):

From cec9e23aea34b625980badb7d72c232fd0fb413d Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:55:13 -0400
Subject: [PATCH 10/28] run cB_to_freq only once per bucket

Former-commit-id: 5e8ef19321447df7d9b3af5ba7e0764579b33930
---
 wordfreq/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index a65ff96..a895fbc 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -213,8 +213,9 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
     freqs = {}
     pack = get_frequency_list(lang, wordlist, match_cutoff)
     for index, bucket in enumerate(pack):
+        freq = cB_to_freq(-index)
         for word in bucket:
-            freqs[word] = cB_to_freq(-index)
+            freqs[word] = freq
     return freqs
 
 

From 1f7c53b3dda99a9a7536cce14a1a4b41ac7108b9 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:56:12 -0400
Subject: [PATCH 11/28] updated word_frequency docstring

Former-commit-id: 4304a400f78af6bd44508fb25ae9e4af5502e5a4
---
 wordfreq/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index a895fbc..800fcee 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -247,13 +247,14 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`. The default wordlist is 'combined', built from
-    whichever of these four sources have sufficient data for the language:
+    whichever of these five sources have sufficient data for the language:
 
       - Full text of Wikipedia
       - A sample of 72 million tweets collected from Twitter in 2014,
         divided roughly into languages using automatic language detection
       - Frequencies extracted from OpenSubtitles
       - The Leeds Internet Corpus
+      - Google Books Ngrams and Google Books Syntactic Ngrams
 
     Another available wordlist is 'twitter', which uses only the data from
     Twitter.

From 7b161644087cba403be1749f1ed706a7940a431d Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:56:40 -0400
Subject: [PATCH 12/28] updated default number of random words to 5

Former-commit-id: 4b49b1a54790863eed0a977027a01ce6dd59b0f5
---
 wordfreq/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 800fcee..574e9e2 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -300,7 +300,7 @@ def top_n_list(lang, n, wordlist='combined', ascii_only=False):
     return results
 
 
-def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12,
+def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
                  ascii_only=False):
     """
     Returns a string of random, space separated words.
@@ -326,7 +326,7 @@ def random_words(lang='en', wordlist='combined', nwords=4, bits_per_word=12,
     return ' '.join(selected)
 
 
-def random_ascii_words(lang='en', wordlist='combined', nwords=4,
+def random_ascii_words(lang='en', wordlist='combined', nwords=5,
                        bits_per_word=12):
     """
     Returns a string of random, space separated, ASCII words.

From d3a6b5413ed16bbec9d84cf6db29844e1c3d1e2c Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:57:45 -0400
Subject: [PATCH 13/28] removed intermediate lists

Former-commit-id: 5342ea30335b3cf43a17494892bdbc0b2950341a
---
 wordfreq/mecab.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index 85d6060..379255b 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -14,8 +14,6 @@ def mecab_tokenize(text):
     contains the same table that the command-line version of MeCab would output.
     We find the tokens in the first column of this table.
     """
-    parsed_str = MECAB_ANALYZER.parse(text.strip())
-    lines = [line for line in parsed_str.split('\n')
-             if line != '' and line != 'EOS']
-    tokens = [line.split('\t')[0] for line in lines]
-    return tokens
+    return [line.split('\t')[0]
+            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            if line != '' and line != 'EOS']

From 589bb624afedb8557b520553f5942979150b623d Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:58:50 -0400
Subject: [PATCH 14/28] updated _emoji_char_class docstring

Former-commit-id: 10b5727725faaba26f87445b08731fbe1ec7483e
---
 scripts/gen_regex.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 280489c..5907b09 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -16,6 +16,7 @@ def _emoji_char_class():
 
     ranges = []
     for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        # c represents the character class (3 corresponds to emoji)
         if c == '3' and i >= 0x2600 and i != 0xfffd:
             if ranges and i == ranges[-1][1] + 1:
                 ranges[-1][1] = i

From f1e71839ea467555600a90fbd25d1fd3f5509d5b Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 14:59:28 -0400
Subject: [PATCH 15/28] fix grammar

Former-commit-id: bd172594d34716ba98f66756f10e0ec9b3b952b9
---
 scripts/gen_regex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 5907b09..5340933 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -8,7 +8,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 def _emoji_char_class():
     """
     Build a regex for emoji substitution.  First we create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji The final regex
+    (like "[a-cv-z]") matching characters we consider emoji. The final regex
     matches one such character followed by any number of spaces and identical
     characters.
     """

From 16494f18697cae73b41fcf5f2dd31eee9e983ab1 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:01:39 -0400
Subject: [PATCH 16/28] added docstring to top_n_list

Former-commit-id: 0b25caaf243a9f92f8106282d5d7d54dd4214d9c
---
 wordfreq/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 574e9e2..6430ee2 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -291,6 +291,11 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
 
 @lru_cache(maxsize=100)
 def top_n_list(lang, n, wordlist='combined', ascii_only=False):
+    """
+    Return a frequency list of length `n` in descending order of frequency.
+    This list contains words from `wordlist`, of the given language.
+    If `ascii_only`, then only ascii words are considered.
+    """
     results = []
     for word in iter_wordlist(lang, wordlist):
         if (not ascii_only) or max(word) <= '~':

From d88470df4e786f6fdd1b55569cec913fb0b9bc47 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:03:26 -0400
Subject: [PATCH 17/28] changed default to minimum for word_frequency

Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e
---
 tests/test.py        | 2 +-
 wordfreq/__init__.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 795d533..358abd4 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -46,7 +46,7 @@ def test_twitter():
 
 def test_defaults():
     eq_(word_frequency('esquivalience', 'en'), 0)
-    eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
+    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
 
 
 def test_most_common_words():
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 6430ee2..f7a1948 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -243,7 +243,7 @@ def half_harmonic_mean(a, b):
 
 
 @lru_cache(maxsize=CACHE_SIZE)
-def word_frequency(word, lang, wordlist='combined', default=0.):
+def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`. The default wordlist is 'combined', built from
@@ -261,7 +261,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
 
     Words that we believe occur at least once per million tokens, based on
     the average of these lists, will appear in the word frequency list.
-    If you look up a word that's not in the list, you'll get the `default`
+    If you look up a word that's not in the list, you'll get the `minimum`
     value, which itself defaults to 0.
 
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
@@ -273,12 +273,12 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     tokens = tokenize(word, lang)
 
     if len(tokens) == 0:
-        return default
+        return minimum
 
     for token in tokens:
         if token not in freqs:
             # If any word is missing, just return the default value
-            return default
+            return minimum
         value = freqs[token]
         if combined_value is None:
             combined_value = value

From 53323f8ea7b44b5f8c2a61caf749cac2a589c620 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:10:59 -0400
Subject: [PATCH 18/28] added arabic tests

Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
---
 tests/test.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test.py b/tests/test.py
index 358abd4..afdefae 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -137,3 +137,14 @@ def test_not_really_random():
 @raises(ValueError)
 def test_not_enough_ascii():
     random_ascii_words(lang='zh')
+
+def test_ar():
+    eq_(
+        tokenize('متــــــــعب', 'ar'),
+        ['متعب']
+    )
+
+    eq_(
+        tokenize('حَرَكَات', 'ar'),
+        ['حركات']
+    )

From aeea503739ef68d20c36c9017e3093f0ee318af9 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:22:04 -0400
Subject: [PATCH 19/28] fixed gen_regex imports, syntax error, and range start

Former-commit-id: 5510fce675c8008ddd28b3070557b5669ab27b5e
---
 scripts/gen_regex.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 5340933..fb94f17 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,7 +1,8 @@
 import argparse
 import unicodedata
-import chardata
+from ftfy import chardata
 import pathlib
+from pkg_resources import resource_filename
 
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
@@ -38,7 +39,7 @@ def _non_punct_class():
     This will classify symbols, including emoji, as punctuation; callers that
     want to treat emoji separately should filter them out first.
     """
-    non_punct_file = DATA_PATH / 'non_punct.txt
+    non_punct_file = DATA_PATH / 'non_punct.txt'
 
     out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
 
@@ -52,7 +53,7 @@ def _combining_mark_class():
     combining_mark_file = DATA_PATH / 'combining_mark.txt'
     out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
 
-    with _combining_mark_file.open(mode='w') as file:
+    with combining_mark_file.open(mode='w') as file:
         file.write(out)
 
 def func_to_regex(accept):
@@ -69,7 +70,7 @@ def func_to_regex(accept):
         if accept(c):
             has_accepted = True
             if start is None:
-                start = None
+                start = c
         elif unicodedata.category(c) == 'Cn':
             if start is None:
                 start = c

From 950e41c8bb6039bd36e7c80fb2dd4b199c3359b5 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:23:15 -0400
Subject: [PATCH 20/28] fixed spacing

Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747
---
 scripts/gen_regex.py | 6 ++++++
 wordfreq/__init__.py | 5 +++++
 2 files changed, 11 insertions(+)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index fb94f17..1c5f6f5 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -4,8 +4,10 @@ from ftfy import chardata
 import pathlib
 from pkg_resources import resource_filename
 
+
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution.  First we create a regex character set
@@ -28,6 +30,7 @@ def _emoji_char_class():
     with emoji_file.open(mode='w') as file:
         file.write(out)
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
@@ -46,6 +49,7 @@ def _non_punct_class():
     with non_punct_file.open(mode='w') as file:
         file.write(out)
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -56,6 +60,7 @@ def _combining_mark_class():
     with combining_mark_file.open(mode='w') as file:
         file.write(out)
 
+
 def func_to_regex(accept):
     """
     Converts a function that accepts a single unicode character into a regex.
@@ -85,6 +90,7 @@ def func_to_regex(accept):
 
     return '[%s]' % ''.join(ranges)
 
+
 if __name__ == '__main__':
     _combining_mark_class()
     _non_punct_class()
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f7a1948..9cc1b8d 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -14,6 +14,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution.  First we create a regex character set
@@ -25,6 +26,7 @@ def _emoji_char_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
@@ -40,6 +42,7 @@ def _non_punct_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -55,6 +58,7 @@ NON_PUNCT_RANGE = _non_punct_class()
 
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 
+
 def simple_tokenize(text):
     """
     A simple tokenizer that can be applied to most languages.
@@ -71,6 +75,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
+
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for

From 5772f1702db49250def1f75ae8556d06fc8fbf49 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:33:36 -0400
Subject: [PATCH 21/28] factored out range loading

Former-commit-id: 32803b235b6730352002961fe232068751360add
---
 wordfreq/__init__.py | 44 ++++++--------------------------------------
 1 file changed, 6 insertions(+), 38 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 9cc1b8d..1a5f39c 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -14,47 +14,15 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
-
-def _emoji_char_class():
-    """
-    Build a regex for emoji substitution.  First we create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji. The final regex
-    matches one such character followed by any number of spaces and identical
-    characters.
-    """
-    non_punct_file = DATA_PATH / 'emoji.txt'
-    with non_punct_file.open() as file:
+def load_range(filename):
+    with (DATA_PATH / filename).open() as file:
         return file.read()
 
+EMOJI_RANGE = load_range('emoji.txt')
+NON_PUNCT_RANGE = load_range('non_punct.txt')
+COMBINING_MARK_RANGE = load_range('combining_mark.txt')
 
-def _non_punct_class():
-    """
-    Builds a regex that matches anything that is not a one of the following
-    classes:
-    - P: punctuation
-    - S: symbols
-    - Z: separators
-    - C: control characters
-    This will classify symbols, including emoji, as punctuation; callers that
-    want to treat emoji separately should filter them out first.
-    """
-    non_punct_file = DATA_PATH / 'non_punct.txt'
-    with non_punct_file.open() as file:
-        return file.read()
-
-
-def _combining_mark_class():
-    """
-    Builds a regex that matches anything that is a combining mark
-    """
-    combining_mark_file = DATA_PATH / 'combining_mark.txt'
-    with combining_mark_file.open() as file:
-        return file.read()
-
-COMBINING_MARK_RE = re.compile(_combining_mark_class())
-
-EMOJI_RANGE = _emoji_char_class()
-NON_PUNCT_RANGE = _non_punct_class()
+COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
 
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 

From 0589bed362590bfdecff4d261279c18f3c189acf Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:33:51 -0400
Subject: [PATCH 22/28] updated docstring

Former-commit-id: 9b851f3afe91177b2853c3498ff0d6b0eb7c42f8
---
 scripts/gen_regex.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 1c5f6f5..ea50186 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -10,10 +10,8 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 def _emoji_char_class():
     """
-    Build a regex for emoji substitution.  First we create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji. The final regex
-    matches one such character followed by any number of spaces and identical
-    characters.
+    Build a regex for emoji substitution.  We create a regex character set
+    (like "[a-cv-z]") matching characters we consider emoji.
     """
     emoji_file = DATA_PATH / 'emoji.txt'
 

From a5dc6eb5fce48f71020948585ef9207773afd494 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:43:34 -0400
Subject: [PATCH 23/28] updated emoji parser

Former-commit-id: f04ca8fc9e40b5d8bfb1563414fc4a15a8c8edb0
---
 scripts/gen_regex.py    | 17 ++++++-----------
 wordfreq/data/emoji.txt |  2 +-
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index ea50186..1a32ac7 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -15,18 +15,13 @@ def _emoji_char_class():
     """
     emoji_file = DATA_PATH / 'emoji.txt'
 
-    ranges = []
-    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
-        # c represents the character class (3 corresponds to emoji)
-        if c == '3' and i >= 0x2600 and i != 0xfffd:
-            if ranges and i == ranges[-1][1] + 1:
-                ranges[-1][1] = i
-            else:
-                ranges.append([i, i])
-    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+    def accept(c):
+        x = ord(c)
+        return chardata.CHAR_CLASS_STRING[x] == '3' and \
+                x >= 0x2600 and x != 0xfffd
 
-    with emoji_file.open(mode='w') as file:
-        file.write(out)
+    with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
+        file.write(func_to_regex(accept))
 
 
 def _non_punct_class():
diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt
index f09f7b9..15c56fb 100644
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@@ -1 +1 @@
-[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮￼-￼𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿]
\ No newline at end of file
+[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿￤-￤￧-￨￭-￸￼-￼𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]
\ No newline at end of file

From 927aaae920426fed6bdd6b38bbe21444edca896b Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:46:33 -0400
Subject: [PATCH 24/28] updated minimum

Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
---
 tests/test.py        | 4 ++--
 wordfreq/__init__.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index afdefae..0a45450 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -44,10 +44,10 @@ def test_twitter():
                        word_frequency('rt', lang, 'combined'))
 
 
-def test_defaults():
+def test_minimums():
     eq_(word_frequency('esquivalience', 'en'), 0)
     eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-
+    eq_(word_frequency('the', 'en', minimum=1), 1)
 
 def test_most_common_words():
     # If something causes the most common words in well-supported languages to
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 1a5f39c..5f2896a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -234,8 +234,8 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
 
     Words that we believe occur at least once per million tokens, based on
     the average of these lists, will appear in the word frequency list.
-    If you look up a word that's not in the list, you'll get the `minimum`
-    value, which itself defaults to 0.
+
+    The value returned will always be at least as large as `minimum`.
 
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
@@ -259,7 +259,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
             # Combine word values using the half-harmonic-mean formula,
             # (a * b) / (a + b). This operation is associative.
             combined_value = half_harmonic_mean(combined_value, value)
-    return combined_value
+    return max(combined_value, minimum)
 
 
 @lru_cache(maxsize=100)

From 993bc4da15a007e988f79df8ae21e24175cd6475 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:47:37 -0400
Subject: [PATCH 25/28] revert to using global mecab_tokenize variable

Former-commit-id: 189a5b9cd6bd36857c73ffd0bef86e38bc40da16
---
 wordfreq/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 5f2896a..bf68f14 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -43,7 +43,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
-
+mecab_tokenize = None
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for
@@ -57,7 +57,9 @@ def tokenize(text, lang):
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        from wordfreq.mecab import mecab_tokenize
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
 
     if lang == 'ar':

From 4d3123e2eedf465b5f47ad9cd7afdeecc236319a Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 16:00:24 -0400
Subject: [PATCH 26/28] cleaned up gen regex

Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2
---
 scripts/gen_regex.py | 45 +++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 1a32ac7..9801a73 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -8,25 +8,31 @@ from pkg_resources import resource_filename
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 
+def cache_regex_from_func(filename, func):
+    """
+    Generates a regex from a function that accepts a single unicode character,
+    and caches it in the data path at filename.
+    """
+    with (DATA_PATH / filename).open(mode='w') as file:
+        file.write(func_to_regex(func))
+
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution.  We create a regex character set
     (like "[a-cv-z]") matching characters we consider emoji.
     """
-    emoji_file = DATA_PATH / 'emoji.txt'
-
-    def accept(c):
-        x = ord(c)
-        return chardata.CHAR_CLASS_STRING[x] == '3' and \
-                x >= 0x2600 and x != 0xfffd
-
-    with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
-        file.write(func_to_regex(accept))
+    cache_regex_from_func(
+        'emoji.txt',
+        lambda c:
+            chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
+            c >= '\u2600' and c != '\ufffd'
+    )
 
 
 def _non_punct_class():
     """
-    Builds a regex that matches anything that is not a one of the following
+    Builds a regex that matches anything that is not one of the following
     classes:
     - P: punctuation
     - S: symbols
@@ -35,23 +41,20 @@ def _non_punct_class():
     This will classify symbols, including emoji, as punctuation; callers that
     want to treat emoji separately should filter them out first.
     """
-    non_punct_file = DATA_PATH / 'non_punct.txt'
-
-    out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
-
-    with non_punct_file.open(mode='w') as file:
-        file.write(out)
+    cache_regex_from_func(
+        'non_punct.txt',
+        lambda c: unicodedata.category(c)[0] not in 'PSZC'
+    )
 
 
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
     """
-    combining_mark_file = DATA_PATH / 'combining_mark.txt'
-    out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
-
-    with combining_mark_file.open(mode='w') as file:
-        file.write(out)
+    cache_regex_from_func(
+        'combining_mark.txt',
+        lambda c: unicodedata.category(c)[0] == 'M'
+    )
 
 
 def func_to_regex(accept):

From 8eac6bf0062c1255d32fa6111adbd4c7a457556c Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 16:00:37 -0400
Subject: [PATCH 27/28] added documentation to load ranges

Former-commit-id: af362480d5bdb7395dd482a39898013fb723dd27
---
 wordfreq/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index bf68f14..afb1971 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -15,6 +15,9 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000
 
 def load_range(filename):
+    """
+    Loads a file from the data path and returns its contents as a string.
+    """
     with (DATA_PATH / filename).open() as file:
         return file.read()
 

From b145e02ce4e98b5d3b7076993dae29c57b4c5768 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 16:21:22 -0400
Subject: [PATCH 28/28] removed unused imports

Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
---
 scripts/gen_regex.py | 1 -
 tests/test.py        | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 9801a73..38d4c39 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,4 +1,3 @@
-import argparse
 import unicodedata
 from ftfy import chardata
 import pathlib
diff --git a/tests/test.py b/tests/test.py
index 0a45450..ba52fb8 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,10 +1,10 @@
 from wordfreq import (
-    word_frequency, available_languages, cB_to_freq, iter_wordlist,
+    word_frequency, available_languages, cB_to_freq,
     top_n_list, random_words, random_ascii_words, tokenize,
     half_harmonic_mean
 )
 from nose.tools import (
-    eq_, assert_almost_equal, assert_greater, assert_less, raises
+    eq_, assert_almost_equal, assert_greater, raises
 )