wordfreq/scripts/gen_regex.py

import argparse
import unicodedata
import chardata

def _emoji_char_class():
    """
    Build a regex character set (like "[a-cv-z]") matching the characters we
    consider emoji.

    A code point is included when its entry in `chardata.CHAR_CLASS_STRING`
    is '3', it is at or above U+2600, and it is not U+FFFD.  Consecutive
    code points are merged into a single "first-last" range; an isolated
    code point is written as a range from itself to itself.
    """
    spans = []
    for codepoint, char_class in enumerate(chardata.CHAR_CLASS_STRING):
        # Skip everything that is not an emoji candidate.
        if char_class != '3' or codepoint < 0x2600 or codepoint == 0xfffd:
            continue
        if spans and spans[-1][1] + 1 == codepoint:
            # Directly follows the previous span: extend it.
            spans[-1][1] = codepoint
        else:
            spans.append([codepoint, codepoint])
    pieces = ['%s-%s' % (chr(first), chr(last)) for first, last in spans]
    return '[' + ''.join(pieces) + ']'

def func_to_regex(accept):
    """
    Converts a function that accepts a single unicode character into a regex
    character class.

    Every code point from U+0000 to U+10FFFF is visited in order, and maximal
    runs of characters for which `accept` returns a true value are collected.
    Unassigned unicode characters (category 'Cn') are treated like their
    neighbors: they may begin, extend or trail a run, but a run is only
    emitted if it contains at least one accepted character.

    accept: a callable taking a one-character string and returning a truthy
        value if that character should be matched.

    Returns a string such as '[a-cv-z]'.  Each run is written as
    'first-last', even when it covers a single character.  If nothing is
    accepted the result is '[]'.

    NOTE(review): range endpoints are emitted verbatim, without escaping, so
    a run that starts or ends on a character that is special inside a
    character class (such as ']' or a backslash) yields a malformed class.
    """
    ranges = []
    start = None          # first character of the run currently being built
    has_accepted = False  # whether that run contains an accepted character
    for x in range(0x110000):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                # An accepted character opens a new run.  (This used to
                # assign None, so such runs were never recorded.)
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned characters tentatively join or open a run.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character ends the run at the previous
            # code point; keep the run only if it held an accepted character.
            if has_accepted:
                ranges.append('-'.join([start, chr(x - 1)]))
                has_accepted = False
            start = None

    # A run still open here extends through the last code point itself.
    if has_accepted and start is not None:
        ranges.append('-'.join([start, chr(0x10ffff)]))

    return '[%s]' % ''.join(ranges)

if __name__ == '__main__':
    # Command-line entry point: evaluate the given expression to obtain an
    # acceptor function and print the character class that matches it.
    # argparse is already imported at the top of the module, so it is not
    # imported again here.
    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
    parser.add_argument('acceptor', help='an python function that accepts a single char')
    args = parser.parse_args()
    # SECURITY NOTE: eval() executes the argument as arbitrary Python code.
    # Only run this script with expressions you wrote yourself; never pass
    # it text that comes from an untrusted source.
    print(func_to_regex(eval(args.acceptor)))
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00			`import argparse`
			`import unicodedata`
updated imports Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5 2015-07-07 18:46:42 +00:00			`import chardata`

			`def _emoji_char_class():`
			`"""`
			`Build a regex for emoji substitution. First we create a regex character set`
			`(like "[a-cv-z]") matching characters we consider emoji The final regex`
			`matches one such character followed by any number of spaces and identical`
			`characters.`
			`"""`
			`ranges = []`
			`for i, c in enumerate(chardata.CHAR_CLASS_STRING):`
			`if c == '3' and i >= 0x2600 and i != 0xfffd:`
			`if ranges and i == ranges[-1][1] + 1:`
			`ranges[-1][1] = i`
			`else:`
			`ranges.append([i, i])`
			`return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)`
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00
			`def func_to_regex(accept):`
			`"""`
			`Converts a function that accepts a single unicode character into a regex.`
			`Unassigned unicode characters are treated like their neighbors.`
			`"""`
			`ranges = []`
			`start = None`
			`has_accepted = False`
			`for x in range(0x110000):`
			`c = chr(x)`

			`if accept(c):`
			`has_accepted = True`
			`if start is None:`
			`start = None`
			`elif unicodedata.category(c) == 'Cn':`
			`if start is None:`
			`start = c`
			`elif start is not None:`
			`if has_accepted:`
			`ranges.append('-'.join([start, chr(x-1)]))`
			`has_accepted = False`
			`start = None`
			`else:`
			`if has_accepted and start is not None:`
			`ranges.append('-'.join([start, chr(x-1)]))`

			`return '[%s]' % ''.join(ranges)`

			`if __name__ == '__main__':`
			`import argparse`

			`parser = argparse.ArgumentParser(description='Generate a regex matching a function')`
			`parser.add_argument('acceptor', help='an python function that accepts a single char')`
			`args = parser.parse_args()`
			`print(func_to_regex(eval(args.acceptor)))`