From 7c189ef129aa966e519455fd926f549c58659fc4 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Fri, 10 Jul 2015 11:03:57 -0400
Subject: [PATCH] created alternate implementation of func-to-regex

---
 scripts/gen_regex.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 0fee536..086b648 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -8,29 +8,30 @@ CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 
-def func_to_regex(func):
+def func_to_regex(accept):
     """
     Given a function that returns True or False for a numerical codepoint,
     return a regex character class accepting the characters resulting in True.
     Ranges separated only by unassigned characters are merged for efficiency.
     """
-    # A list of [start, end (accepted), end (accepted or unassigned)] lists
+    # First and last accepted codepoints of the range being built; `start` is
+    # None when no range is open. Unassigned (Cn) codepoints do not close it.
+    start = end = None
     ranges = []
 
-    for i, cat in enumerate(CATEGORIES):
-        if func(i):
-            # If the last range can be extended, do so; else start a new one
-            if ranges and ranges[-1][2] == i - 1:
-                ranges[-1][1] = i
-                ranges[-1][2] = i
-            else:
-                ranges.append([i, i, i])
-        elif cat == 'Cn':
-            # If the last range can be extended, do so
-            if ranges and ranges[-1][2] == i - 1:
-                ranges[-1][2] = i
+    for codepoint, category in enumerate(CATEGORIES):
+        if accept(codepoint):
+            if start is None:
+                start = codepoint
+            end = codepoint
+        elif category != 'Cn' and start is not None:
+            ranges.append((start, end))
+            start = end = None
 
-    return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges)
+    if start is not None:
+        ranges.append((start, end))
+
+    return '[%s]' % ''.join('%s-%s' % (chr(r[0]), chr(r[1])) for r in ranges)
 
 
 def cache_regex_from_func(filename, func):