optimized ranges and treats unassigned codepoints like their neighbors

This commit is contained in:
Joshua Chin 2015-06-25 14:38:32 -04:00
parent 5fc448bc60
commit 91a14f6e6e
3 changed files with 52 additions and 25 deletions

View File

@ -50,16 +50,8 @@ def _non_punct_class():
with non_punct_file.open() as file:
return file.read()
except FileNotFoundError:
non_punct = []
for x in range(0x110000):
cat = unicodedata.category(chr(x))
if cat[0] not in 'PSZC' or cat == 'Cn':
non_punct.append(x)
non_punct_ranges = to_ranges(non_punct)
out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in non_punct_ranges)
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
with non_punct_file.open(mode='w') as file:
file.write(out)
@ -75,13 +67,8 @@ def _combining_mark_class():
with _combining_mark_file.open() as file:
return file.read()
except FileNotFoundError:
combining_mark = [x for x in range(0x110000)
if unicodedata.category(chr(x))[0] == 'M']
combining_mark_ranges = to_ranges(combining_mark)
out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in combining_mark_ranges)
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
with _combining_mark_file.open(mode='w') as file:
file.write(out)
@ -89,19 +76,59 @@ def _combining_mark_class():
return out
def func_to_ranges(accept):
    """
    Convert a predicate on single unicode characters into a list of inclusive
    codepoint ranges.

    `accept` is called with a one-character string and should return a truthy
    value for characters that belong in the result. Unassigned codepoints
    (category 'Cn') are always accepted, and `accept` is not called for them.

    Returns a list of (start, end) tuples of ints, in ascending order, where
    both endpoints are inclusive. Every codepoint from 0 to 0x10FFFF is
    examined.
    """
    ranges = []
    start = None  # first codepoint of the run currently being built, if any
    for codepoint in range(0x110000):
        char = chr(codepoint)
        if unicodedata.category(char) == 'Cn' or accept(char):
            if start is None:
                start = codepoint
        elif start is not None:
            # The run ended on the previous codepoint
            ranges.append((start, codepoint - 1))
            start = None
    if start is not None:
        # A run still open at the end of the loop extends to the last codepoint
        ranges.append((start, 0x10FFFF))
    return ranges
# Cache of the ranges made up purely of unassigned codepoints. It is filled in
# on the first call to func_to_regex, because computing it requires a scan of
# every codepoint.
unassigned_ranges = None


def func_to_regex(accept):
    """
    Convert a predicate on single unicode characters into a regex character
    class such as '[a-cx-z]'.

    `accept` is called with a one-character string and should return a truthy
    value for characters the class should match.

    Unassigned codepoints (category 'Cn') are treated like their neighbors: a
    run of unassigned codepoints that touches an accepted character is
    absorbed into that character's range, while a run bordered only by
    rejected characters is left out of the class.

    Range endpoints are escaped, so characters that are special inside a
    character class (such as a backslash, ']', '^' or '-') cannot corrupt the
    pattern. If no range survives, the result is '[]'.
    """
    global unassigned_ranges
    if unassigned_ranges is None:
        # A predicate that rejects everything leaves only the unassigned runs
        unassigned_ranges = set(func_to_ranges(lambda _: False))

    pieces = []
    for start, end in func_to_ranges(accept):
        if (start, end) in unassigned_ranges:
            # This range is nothing but unassigned codepoints with no accepted
            # neighbor, so it does not belong in the class
            continue
        pieces.append('%s-%s' % (re.escape(chr(start)), re.escape(chr(end))))
    return '[%s]' % ''.join(pieces)
# Compiled form of the character class returned by _combining_mark_class(),
# which is built from characters whose unicode category starts with 'M'.
COMBINING_MARK_RE = re.compile(_combining_mark_class())
# Result of _non_punct_class(), stored as returned rather than compiled here.
# NOTE(review): presumably the character-class string for characters whose
# category does not start with P, S, Z or C -- confirm against the full
# definition of _non_punct_class.
NON_PUNCT_RANGE = _non_punct_class()

View File

@ -1 +1 @@
[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ࣾऀ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-ৄে-ৈো-্ৗ-ৗৢ-ৣਁ-ਃ਼-਼ਾ-ੂੇ-ੈੋ-੍ੑ-ੑੰ-ੱੵ-ੵઁ-ઃ઼-઼ા-ૅે-ૉો-્ૢ-ૣଁ-ଃ଼-଼ା-ୄେ-ୈୋ-୍ୖ-ୗୢ-ୣஂ-ஂா-ூெ-ைொ-்ௗ-ௗఁ-ఃా-ౄె-ైొ-్ౕ-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೄೆ-ೈೊ-್ೕ-ೖೢ-ೣം-ഃാ-ൄെ-ൈൊ-്ൗ-ൗൢ-ൣං-ඃ්-්ා-ුූ-ූෘ-ෟෲ-ෳั-ัิ-ฺ็-๎ັ-ັິ-ູົ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྗྙ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-ᤫᤰ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-ᩞ᩠-᩿᩼-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-ᷦ᷼-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨃𐨅-𐨆𐨌-𐨏𐨸-𐨿𐨺-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖽾𖾏-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿]

View File

@ -1 +1 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽͿΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֋-֎֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײ׵-׿؅-؅ؐ-ؚ؝-؝ؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿ܎-܎ܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿍-࿍࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥁-᥃᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽᲀ-Ჿ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ῿-῿⁥-⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⏴-⏿␧-␿⑋-⒛⓪-⓿✀-✀❶-➓⭍-⭏⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ⸼-⹿⺚-⺚⻴-⻿⿖-⿯⿼-⿿々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㋿-㋿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱꛸-꛿ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹓-﹓﹧-﹧﹬-﻾＀-＀---zヲ-￟￧-￧￯-￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐆜-𐇏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝃶-𝃿𝄧-𝄨𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝇞-𝇿𝉂-𝉄𝉆-𝋿𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯𞻲-𞿿🀬-🀯🂔-🂟🂯-🂰🂿-🃀🃐-🃐🃠-🄏🄯-🄯🅬-🅯🆛-🇥🈃-🈏🈻-🈿🉉-🉏🉒-🋿🌡-🌯🌶-🌶🍽-🍿🎔-🎟🏅-🏅🏋-🏟🏱-🏿🐿-🐿👁-👁📸-📸📽-📿🔾-🔿🕄-🕏🕨-🗺🙁-🙄🙐-🙿🛆-🛿🝴-󠀀󠀂-󠀟󠂀-󯿿󿿾-󿿿􏿾-􏿿]
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0---zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿]