optimized ranges and treats unassigned codepoints like their neighbors

Former-commit-id: 91a14f6e6e
2024-12-23 17:31:41 +00:00 · 2015-06-25 14:38:32 -04:00 · 2015-06-25 14:38:32 -04:00 · da370511e3
commit da370511e3
parent 59818f524f
3 changed files with 52 additions and 25 deletions
--- a/wordfreq/init.py
+++ b/wordfreq/init.py
@ -50,16 +50,8 @@ def _non_punct_class():
        with non_punct_file.open() as file:
            return file.read()
    except FileNotFoundError:
-        non_punct = []
-        for x in range(0x110000):
-            cat = unicodedata.category(chr(x))
-            if cat[0] not in 'PSZC' or cat == 'Cn':
-                non_punct.append(x)

-        non_punct_ranges = to_ranges(non_punct)
-
-        out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
-                for start, end in non_punct_ranges)
+        out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')

        with non_punct_file.open(mode='w') as file:
            file.write(out)
@ -75,13 +67,8 @@ def _combining_mark_class():
        with _combining_mark_file.open() as file:
            return file.read()
    except FileNotFoundError:
-        combining_mark = [x for x in range(0x110000)
-                        if unicodedata.category(chr(x))[0] == 'M']

-        combining_mark_ranges = to_ranges(combining_mark)
-
-        out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
-                for start, end in combining_mark_ranges)
+        out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')

        with _combining_mark_file.open(mode='w') as file:
            file.write(out)
@ -89,19 +76,59 @@ def _combining_mark_class():
        return out


-def to_ranges(seq):
+def func_to_ranges(accept):
    """
-    Converts a sequence of int's into a list of inclusives ranges
+    Converts a function that accepts a single unicode character into a list of
+    inclusive (start, end) ranges. Unassigned code points are always accepted.
    """
    ranges = []
-    start_range = seq[0]
-    for previous, elem in zip(seq, seq[1:]):
-        if elem - previous != 1:
-            ranges.append((start_range, previous))
-            start_range = elem
-    ranges.append((start_range, seq[-1]))
+    start = None  # first code point of the run currently open, if any
+    for x in range(0x110000):  # every code point, U+0000 through U+10FFFF
+        cat = unicodedata.category(chr(x))
+        if cat == 'Cn' or accept(chr(x)):  # 'Cn' = unassigned, accepted regardless
+            if start is None:
+                start = x  # open a new run
+        else:
+            if start is not None:
+                ranges.append((start, x-1))  # run ended at the previous code point
+                start = None
+
+    if start is not None:  # a run was still open when the loop finished
+        ranges.append((start, x))  # x is the last code point visited
+
    return ranges

+unassigned_ranges = None  # cache filled on the first func_to_regex call
+
+def func_to_regex(accept):
+    """
+    Converts a function that accepts a single unicode character into a regex
+    character class. Unassigned characters are treated like their neighbors.
+    """
+    ranges = []  # NOTE(review): the loop below repeats func_to_ranges(accept)
+    start = None  # first code point of the run currently open, if any
+    for x in range(0x110000):  # every code point, U+0000 through U+10FFFF
+        cat = unicodedata.category(chr(x))
+        if cat == 'Cn' or accept(chr(x)):  # 'Cn' = unassigned, joins any run
+            if start is None:
+                start = x  # open a new run
+        else:
+            if start is not None:
+                ranges.append((start, x-1))  # run ended at the previous code point
+                start = None
+
+    if start is not None:  # a run was still open when the loop finished
+        ranges.append((start, x))  # x is the last code point visited
+
+    global unassigned_ranges
+    if unassigned_ranges is None:  # computed once, then reused by later calls
+        unassigned_ranges = set(func_to_ranges(lambda _: False))  # 'Cn'-only runs
+
+    ranges = [range for range in ranges if range not in unassigned_ranges]  # NOTE(review): shadows builtin `range`
+
+    return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
+                                for start, end in ranges)  # NOTE(review): endpoints are not regex-escaped
+

 COMBINING_MARK_RE = re.compile(_combining_mark_class())
 NON_PUNCT_RANGE = _non_punct_class()
--- a/wordfreq/data/combining_mark.txt
+++ b/wordfreq/data/combining_mark.txt
@ -1 +1 @@
-[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ࣾऀ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-ৄে-ৈো-্ৗ-ৗৢ-ৣਁ-ਃ਼-਼ਾ-ੂੇ-ੈੋ-੍ੑ-ੑੰ-ੱੵ-ੵઁ-ઃ઼-઼ા-ૅે-ૉો-્ૢ-ૣଁ-ଃ଼-଼ା-ୄେ-ୈୋ-୍ୖ-ୗୢ-ୣஂ-ஂா-ூெ-ைொ-்ௗ-ௗఁ-ఃా-ౄె-ైొ-్ౕ-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೄೆ-ೈೊ-್ೕ-ೖೢ-ೣം-ഃാ-ൄെ-ൈൊ-്ൗ-ൗൢ-ൣං-ඃ්-්ා-ුූ-ූෘ-ෟෲ-ෳั-ัิ-ฺ็-๎ັ-ັິ-ູົ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྗྙ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-ᤫᤰ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-ᩞ᩠-᩿᩼-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-ᷦ᷼-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨃𐨅-𐨆𐨌-𐨏𐨸-𐨿𐨺-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖽾𖾏-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
+[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿]
--- a/wordfreq/data/non_punct.txt
+++ b/wordfreq/data/non_punct.txt
@ -1 +1 @@
-[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽͿ-΃Ά-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֋-֎֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײ׵-׿؅-؅ؐ-ؚ؝-؝ؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿ܎-܎ܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿍-࿍࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥁-᥃᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽᲀ-Ჿ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ῿-῿⁥-⁥⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⏴-⏿␧-␿⑋-⒛⓪-⓿✀-✀❶-➓⭍-⭏⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ⸼-⹿⺚-⺚⻴-⻿⿖-⿯⿼-⿿々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㋿-㋿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱꛸-꛿ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹓-﹓﹧-﹧﹬-﻾＀-＀０-９Ａ-Ｚａ-ｚｦ-￟￧-￧￯-￸-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐆜-𐇏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝃶-𝃿𝄧-𝄨𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝇞-𝇿𝉂-𝉄𝉆-𝋿𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯𞻲-𞿿🀬-🀯🂔-🂟🂯-🂰🂿-🃀🃐-🃐🃠-🄏🄯-🄯🅬-🅯🆛-🇥🈃-🈏🈻-🈿🉉-🉏🉒-🋿🌡-🌯🌶-🌶🍽-🍿🎔-🎟🏅-🏅🏋-🏟🏱-🏿🐿-🐿👁-👁📸-📸📽-📿🔾-🔿🕄-🕏🕨-🗺🙁-🙄🙐-🙿🛆-🛿🝴-󠀀󠀂-󠀟󠂀-󯿿󿿾-󿿿􏿾-􏿿]
+[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾０-９Ａ-Ｚａ-ｚｦ-￟-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿]