mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
optimized ranges and treats unassigned codepoints like their neighbors
This commit is contained in:
parent
5fc448bc60
commit
91a14f6e6e
@ -50,16 +50,8 @@ def _non_punct_class():
|
||||
with non_punct_file.open() as file:
|
||||
return file.read()
|
||||
except FileNotFoundError:
|
||||
non_punct = []
|
||||
for x in range(0x110000):
|
||||
cat = unicodedata.category(chr(x))
|
||||
if cat[0] not in 'PSZC' or cat == 'Cn':
|
||||
non_punct.append(x)
|
||||
|
||||
non_punct_ranges = to_ranges(non_punct)
|
||||
|
||||
out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
|
||||
for start, end in non_punct_ranges)
|
||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
|
||||
|
||||
with non_punct_file.open(mode='w') as file:
|
||||
file.write(out)
|
||||
@ -75,13 +67,8 @@ def _combining_mark_class():
|
||||
with _combining_mark_file.open() as file:
|
||||
return file.read()
|
||||
except FileNotFoundError:
|
||||
combining_mark = [x for x in range(0x110000)
|
||||
if unicodedata.category(chr(x))[0] == 'M']
|
||||
|
||||
combining_mark_ranges = to_ranges(combining_mark)
|
||||
|
||||
out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
|
||||
for start, end in combining_mark_ranges)
|
||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
|
||||
|
||||
with _combining_mark_file.open(mode='w') as file:
|
||||
file.write(out)
|
||||
@ -89,19 +76,59 @@ def _combining_mark_class():
|
||||
return out
|
||||
|
||||
|
||||
def to_ranges(seq):
    """
    Converts a sequence of ints into a list of inclusive ranges.

    Each range is a (start, end) tuple covering a run in which every element
    is exactly one greater than the element before it. An empty sequence
    yields an empty list.
    """
    if not seq:
        # Nothing to group; indexing seq[0] below would raise IndexError.
        return []

    ranges = []
    start_range = seq[0]
    for previous, elem in zip(seq, seq[1:]):
        if elem - previous != 1:
            # A gap closes the current run and opens a new one at `elem`.
            ranges.append((start_range, previous))
            start_range = elem
    ranges.append((start_range, seq[-1]))

    return ranges


def func_to_ranges(accept):
    """
    Converts a function that accepts a single Unicode character into a list
    of inclusive (start, end) codepoint ranges.

    `accept` is called with a one-character string and should return a true
    value for characters that belong in the ranges. Unassigned codepoints
    (category 'Cn') are always accepted, without consulting `accept`.
    """
    last_codepoint = 0x10FFFF
    ranges = []
    start = None
    for x in range(last_codepoint + 1):
        char = chr(x)
        if unicodedata.category(char) == 'Cn' or accept(char):
            if start is None:
                start = x
        elif start is not None:
            # First rejected codepoint after a run: the run ended at x - 1.
            ranges.append((start, x - 1))
            start = None

    if start is not None:
        # The final run extends to the end of the codepoint space.
        ranges.append((start, last_codepoint))

    return ranges
|
||||
|
||||
# Cache of the ranges that consist solely of unassigned codepoints. It is
# filled in on the first call to func_to_regex and reused on later calls.
unassigned_ranges = None


def func_to_regex(accept):
    """
    Converts a function that accepts a single Unicode character into a regex.

    The result is a character-class string of the form '[a-bc-d...]', built
    from the inclusive ranges returned by func_to_ranges(accept).

    Unassigned Unicode characters are treated like their neighbors: they are
    kept when they extend a range of accepted characters, while ranges that
    contain nothing but unassigned codepoints are dropped.
    """
    global unassigned_ranges
    if unassigned_ranges is None:
        # With an accept function that rejects everything, the only ranges
        # produced are the runs of unassigned codepoints.
        unassigned_ranges = set(func_to_ranges(lambda _: False))

    ranges = [
        (start, end)
        for start, end in func_to_ranges(accept)
        if (start, end) not in unassigned_ranges
    ]

    return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
                            for start, end in ranges)
|
||||
|
||||
|
||||
COMBINING_MARK_RE = re.compile(_combining_mark_class())
|
||||
NON_PUNCT_RANGE = _non_punct_class()
|
||||
|
@ -1 +1 @@
|
||||
[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ࣾऀ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-ৄে-ৈো-্ৗ-ৗৢ-ৣਁ-ਃ਼-਼ਾ-ੂੇ-ੈੋ-੍ੑ-ੑੰ-ੱੵ-ੵઁ-ઃ઼-઼ા-ૅે-ૉો-્ૢ-ૣଁ-ଃ଼-଼ା-ୄେ-ୈୋ-୍ୖ-ୗୢ-ୣஂ-ஂா-ூெ-ைொ-்ௗ-ௗఁ-ఃా-ౄె-ైొ-్ౕ-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೄೆ-ೈೊ-್ೕ-ೖೢ-ೣം-ഃാ-ൄെ-ൈൊ-്ൗ-ൗൢ-ൣං-ඃ්-්ා-ුූ-ූෘ-ෟෲ-ෳั-ัิ-ฺ็-๎ັ-ັິ-ູົ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྗྙ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-ᤫᤰ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-ᩞ᩠-᩿᩼-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-ᷦ᷼-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨃𐨅-𐨆𐨌-𐨏𐨸-𐨿𐨺-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖽾𖾏-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
|
||||
[̀-ͯ҃-҉-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙-ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ--়া-্-ৢ-ৼ--ੰ-ੱੵ--઼ા-ૢ---଼ା-ୢ--ஂ---ఄా-ౢ-ಀ-಄-಼ಾ-ೝೢ-ೳ-ഄാ-്൏-ൟൢ---ෳั-ัิ-็-๎ັ-ັິ-ຼ-༘-༙༵-༵༷-༹༷-༹༾-༿-྄྆-྇ྍ-࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ-፟ᜒ-ᜟᜲ-᜴ᝒ--឴-៓៝-᠋-᠍ᢩ-ᢩᤝ--ᧀᧈ-ᨗ-ᩕ-᩿-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-ᰤ--᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⳯-⳱-⵿-〪ⷿ-〯-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ-ꢁꢴ--꣱ꤦ-꤭ꥇ--ꦃ꦳-꧀ꨩ-ꩃ-ꩃꩌ-ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-ꯣ-ꯪ꯬-ﬞ-ﬞ﷾-️-𐇽︯-𐨁-𐨏𐨴-𐨿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺-𑄂𑄧-𑅄-𑆂𑆳-𑇀𑚫-𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄-]
|
@ -1 +1 @@
|
||||
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽͿ-Ά-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ-֎-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײ--ؐ-ؚ؝-؝ؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿ-ܐ-ߵߺ---ॣ०-९ॱ-ৱ৴-৹ৼ-૯-୯ୱ-௲-౾ಀ-൸ൺ-ෳ-เ-๎๐-๙-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-࿆-࿆--၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ-ᐁ-ᙬᙯ-ᙿᚁ-ᚚ-ᛪᛮ-᜴-៓ៗ-ៗៜ-᠋-᠍᠏--᥆-ᨀ-ᨠ-ᪧ-ᪧ-᭙᭫-᭳᭽-ᰀ-᱀-ᱽᲀ-Ჿ-᳔᳒-ᾼι-ιῂ-ῌῐ-ῠ-Ῥ-ῼ--⁰-⁹ⁿ-₉-₻-ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-⏴-⏿--⒛⓪-⓿✀-✀❶-➓⭍-⭏⭚-ⳤⳫ-⳽-⳽ⴀ-ⵯ-ⷿⸯ-ⸯ⸼-----々-〇〡-〯〱-〵〸-〼-゚ゝ-ゟァ-ヺー-㆒-㆕ㆠ-ㆿ-ㇿ-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㋿-㋿㐀-䶿一--ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱-ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵-ꡳ-꣐-ꣷꣻ-꤭ꤰ-ꥠ-꧀-ꧠ-ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️-︯----0-9A-Za-zヲ-----𐅀-𐅸𐆊-𐆜-𐇽-𐎠-𐏏𐏑-𐡘-𐤠---𐩾𐪀-𐭀-𑁆-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑴---𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝇞-𝉂-𝉄--𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-----🂿--🃠-🄏🄯-🄯🅬-🅯🆛--🈻---🌡-🌯🌶-🌶🍽-🍿🎔-🎟🏅-🏅🏋-🏟🏱-🏿🐿-🐿👁-👁📸-📸📽-📿🔾-🔿🕄-🕏🕨-🗺🙁-🙄🙐-🙿🛆-🝴-----]
|
||||
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ---ॣ०-९ॱ-ৱ৴-৹ৼ-૯-୯ୱ-௲-౾ಀ-൸ൺ-ෳ-เ-๎๐-๙-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-࿆-࿆-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ-ᐁ-ᙬᙯ-ᙿᚁ-ᚚ-ᛪᛮ-᜴-៓ៗ-ៗៜ-᠋-᠍᠏-᥆-ᨀ-ᨠ-ᪧ-ᪧ-᭙᭫-᭳᭽-ᰀ-᱀-ᱽ-᳔᳒-ᾼι-ιῂ-ῌῐ-ῠ-Ῥ-ῼ⁰-⁹ⁿ-₉-₻-ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐--⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳽-⳽ⴀ-ⵯ-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼-゚ゝ-ゟァ-ヺー-㆒-㆕ㆠ-ㆿ-ㇿ-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一--ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵-ꡳ-꣐-ꣷꣻ-꤭ꤰ-ꥠ-꧀-ꧠ-ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️-︯-0-9A-Za-zヲ---𐅀-𐅸𐆊-𐇽-𐎠-𐏏𐏑-𐡘-𐤠---𐩾𐪀-𐭀-𑁆-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑴-𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-🃠-🄏🝴--]
|
Loading…
Reference in New Issue
Block a user