wordfreq/scripts/gen_regex.py

import argparse
import unicodedata
from ftfy import chardata
import pathlib
from pkg_resources import resource_filename


# Directory of the data files packaged with wordfreq; the regex caches
# generated by this script are written into it.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))


def cache_regex_from_func(filename, func):
    """
    Generates a regex from a function that accepts a single unicode character,
    and caches it in the data path at filename.

    filename: name of the cache file, resolved relative to DATA_PATH.
    func: predicate taking a one-character string; it is handed to
        func_to_regex to build the character set.

    An existing file with the same name is overwritten.  Returns None.
    """
    # Build the regex before touching the file: opening in 'w' mode truncates
    # it, so generating first means an exception raised while scanning the
    # characters cannot leave an empty cache file behind.
    regex = func_to_regex(func)

    # NOTE(review): the file is written with the platform's default text
    # encoding -- confirm that whatever reads the cache uses the same one.
    with (DATA_PATH / filename).open(mode='w') as out:
        out.write(regex)


def _emoji_char_class():
    """
    Write the emoji character set to 'emoji.txt' in the data path.

    The cached value is a regex character set (like "[a-cv-z]") covering the
    characters we consider emoji: those that ftfy's CHAR_CLASS_STRING table
    labels '3', that sit at U+2600 or later, and that are not U+FFFD.
    """
    def is_emoji(char):
        # The table lookup is checked first, then the lower bound, then the
        # exclusion of U+FFFD.
        if chardata.CHAR_CLASS_STRING[ord(char)] != '3':
            return False
        if char < '\u2600':
            return False
        return char != '\ufffd'

    cache_regex_from_func('emoji.txt', is_emoji)


def _non_punct_class():
    """
    Write the "not punctuation" character set to 'non_punct.txt'.

    A character is matched unless the first letter of its Unicode general
    category is one of:
    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters (and the other "C" categories)

    Symbols, and therefore emoji, count as punctuation here; callers that
    want to handle emoji separately should filter them out beforehand.
    """
    excluded_major_categories = ('P', 'S', 'Z', 'C')

    def is_non_punct(char):
        major_category = unicodedata.category(char)[0]
        return major_category not in excluded_major_categories

    cache_regex_from_func('non_punct.txt', is_non_punct)


def _combining_mark_class():
    """
    Write the combining-mark character set to 'combining_mark.txt'.

    A character is matched when its Unicode general category begins with 'M'.
    """
    def is_combining_mark(char):
        return unicodedata.category(char).startswith('M')

    cache_regex_from_func('combining_mark.txt', is_combining_mark)


def func_to_regex(accept):
    """
    Converts a function that accepts a single unicode character into a regex.
    Unassigned unicode characters are treated like their neighbors.

    accept: predicate called with every one-character string from U+0000 to
        U+10FFFF; a truthy result puts the character in the set.

    Returns a regex character set such as "[a-cv-z]" made of "first-last"
    ranges.  A range endpoint that is special inside a character set
    (backslash, '[', ']', '^' or '-') is backslash-escaped.  When nothing is
    accepted the result is "[]".
    """
    last_codepoint = 0x10FFFF

    def escaped(char):
        # Only these characters change meaning inside a character set.
        return '\\' + char if char in '\\[]^-' else char

    def as_range(first, last):
        return '%s-%s' % (escaped(first), escaped(last))

    ranges = []
    start = None          # first character of the run being built, if any
    has_accepted = False  # whether that run holds an accepted character

    for x in range(last_codepoint + 1):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: extend the current run, or tentatively open one
            # that is kept only if an accepted character joins it.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character closes the run just before it.
            if has_accepted:
                ranges.append(as_range(start, chr(x - 1)))
                has_accepted = False
            start = None

    # A run still open here extends through the final code point itself, so
    # it must end at U+10FFFF rather than one character earlier.
    if has_accepted:
        ranges.append(as_range(start, chr(last_codepoint)))

    return '[%s]' % ''.join(ranges)


if __name__ == '__main__':
    # Regenerate every cached character set, one data file per generator.
    generators = (
        _combining_mark_class,
        _non_punct_class,
        _emoji_char_class,
    )
    for generate in generators:
        generate()
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00			`import argparse`
			`import unicodedata`
fixed gen_regex Former-commit-id: 5510fce675c8008ddd28b3070557b5669ab27b5e 2015-07-07 19:22:04 +00:00			`from ftfy import chardata`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`import pathlib`
fixed gen_regex Former-commit-id: 5510fce675c8008ddd28b3070557b5669ab27b5e 2015-07-07 19:22:04 +00:00			`from pkg_resources import resource_filename`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00
fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))`
updated imports Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5 2015-07-07 18:46:42 +00:00
fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 2015-07-07 20:00:24 +00:00			`def cache_regex_from_func(filename, func):`
			`"""`
			`Generates a regex from a function that accepts a single unicode character,`
			`and caches it in the data path at filename.`
			`"""`
			`with (DATA_PATH / filename).open(mode='w') as file:`
			`file.write(func_to_regex(func))`


updated imports Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5 2015-07-07 18:46:42 +00:00			`def _emoji_char_class():`
			`"""`
updated docstring Former-commit-id: 9b851f3afe91177b2853c3498ff0d6b0eb7c42f8 2015-07-07 19:33:51 +00:00			`Build a regex for emoji substitution. We create a regex character set`
			`(like "[a-cv-z]") matching characters we consider emoji.`
updated imports Former-commit-id: f2b615b0f04d409a2a2bcf46433580a2dbea7fc5 2015-07-07 18:46:42 +00:00			`"""`
cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 2015-07-07 20:00:24 +00:00			`cache_regex_from_func(`
			`'emoji.txt',`
			`lambda c:`
			`chardata.CHAR_CLASS_STRING[ord(c)] == '3' and`
			`c >= '\u2600' and c != '\ufffd'`
			`)`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00
fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`def _non_punct_class():`
			`"""`
cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 2015-07-07 20:00:24 +00:00			`Builds a regex that matches anything that is not one of the following`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`classes:`
			`- P: punctuation`
			`- S: symbols`
			`- Z: separators`
			`- C: control characters`
			`This will classify symbols, including emoji, as punctuation; callers that`
			`want to treat emoji separately should filter them out first.`
			`"""`
cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 2015-07-07 20:00:24 +00:00			`cache_regex_from_func(`
			`'non_punct.txt',`
			`lambda c: unicodedata.category(c)[0] not in 'PSZC'`
			`)`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00
fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`def _combining_mark_class():`
			`"""`
			`Builds a regex that matches anything that is a combining mark`
			`"""`
cleaned up gen regex Former-commit-id: 27ea107e6fc0f8e95519728565dd5618d7e8c0d2 2015-07-07 20:00:24 +00:00			`cache_regex_from_func(`
			`'combining_mark.txt',`
			`lambda c: unicodedata.category(c)[0] == 'M'`
			`)`
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00
fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00			`def func_to_regex(accept):`
			`"""`
			`Converts a function that accepts a single unicode character into a regex.`
			`Unassigned unicode characters are treated like their neighbors.`
			`"""`
			`ranges = []`
			`start = None`
			`has_accepted = False`
			`for x in range(0x110000):`
			`c = chr(x)`

			`if accept(c):`
			`has_accepted = True`
			`if start is None:`
fixed gen_regex Former-commit-id: 5510fce675c8008ddd28b3070557b5669ab27b5e 2015-07-07 19:22:04 +00:00			`start = c`
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00			`elif unicodedata.category(c) == 'Cn':`
			`if start is None:`
			`start = c`
			`elif start is not None:`
			`if has_accepted:`
			`ranges.append('-'.join([start, chr(x-1)]))`
			`has_accepted = False`
			`start = None`
			`else:`
			`if has_accepted and start is not None:`
			`ranges.append('-'.join([start, chr(x-1)]))`

			`return '[%s]' % ''.join(ranges)`

fixed spacing Former-commit-id: ae4699029d3b09621ac410c26b981266056f1747 2015-07-07 19:23:15 +00:00
factored out regex generation Former-commit-id: 476a909e4d68a7fe79244620441e3400124925e0 2015-07-07 18:38:21 +00:00			`if __name__ == '__main__':`
updated gen_regex to be run as script Former-commit-id: 22fbea424841cbd7c5181be65df224c1f6b6e971 2015-07-07 18:50:56 +00:00			`_combining_mark_class()`
			`_non_punct_class()`
			`_emoji_char_class()`