Merge pull request #16 from LuminosoInsight/more-tweaking

More tweaking
This commit is contained in:
Joshua Chin 2015-07-10 14:36:18 -04:00
commit 2c573b5a0e
9 changed files with 97 additions and 144 deletions

View File

@@ -21,6 +21,12 @@ install them on Ubuntu:
sudo apt-get install mecab-ipadic-utf8 libmecab-dev
pip3 install mecab-python3
## Unicode data
The tokenizers used to split non-Japanese phrases use regexes built using the
`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
update these regexes, run `scripts/gen_regex.py`.
## License
`wordfreq` is freely redistributable under the MIT license (see

View File

@@ -4,9 +4,34 @@ import pathlib
from pkg_resources import resource_filename
CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def func_to_regex(accept_func):
"""
Given a function that returns True or False for a numerical codepoint,
return a regex character class matching the characters for which it returns True.
Ranges separated only by unassigned characters are merged for efficiency.
"""
# parsing_range is True if the current codepoint might be in a range that
# the regex will accept
parsing_range = False
ranges = []
for codepoint, category in enumerate(CATEGORIES):
if accept_func(codepoint):
if not parsing_range:
ranges.append([codepoint, codepoint])
parsing_range = True
else:
ranges[-1][1] = codepoint
elif category != 'Cn':
parsing_range = False
return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
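As a quick illustration (a sketch for illustration, not part of this commit): an accept function that is True only for the ASCII digits produces a single merged range.

import re

digit_class = func_to_regex(lambda codepoint: 0x30 <= codepoint <= 0x39)
print(digit_class)                     # prints '[0-9]'
assert re.fullmatch(digit_class, '7')  # the class matches a digit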
def cache_regex_from_func(filename, func):
"""
Generates a regex from a function that accepts a single unicode character,
@@ -16,77 +41,36 @@ def cache_regex_from_func(filename, func):
file.write(func_to_regex(func))
def _emoji_char_class():
"""
Build a regex for emoji substitution. We create a regex character set
(like "[a-cv-z]") matching characters we consider emoji.
"""
cache_regex_from_func(
'emoji.txt',
lambda c:
chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
c >= '\u2600' and c != '\ufffd'
)
def _is_emoji_codepoint(i):
"""
Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
character (as future-proofed by the ftfy chardata module), excluding symbols
below U+2600 (such as ©) and the replacement character U+FFFD.
"""
return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd
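For a rough sense of what this accepts (an approximation of mine, assuming Unicode category 'So' stands in for ftfy's class '3'):

import unicodedata

def is_emoji_like(i):
    # Approximation: category 'So', at or above U+2600, and not U+FFFD.
    return unicodedata.category(chr(i)) == 'So' and i >= 0x2600 and i != 0xfffd

assert is_emoji_like(0x2600)      # ☀ BLACK SUN WITH RAYS
assert not is_emoji_like(0xa9)    # © is 'So' but sits below U+2600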
def _non_punct_class():
"""
Builds a regex that matches anything that is not one of the following
classes:
- P: punctuation
- S: symbols
- Z: separators
- C: control characters
This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first.
"""
cache_regex_from_func(
'non_punct.txt',
lambda c: unicodedata.category(c)[0] not in 'PSZC'
)
def _is_non_punct_codepoint(i):
"""
Report whether a numerical codepoint is not one of the following classes:
- P: punctuation
- S: symbols
- Z: separators
- C: control characters
This will classify symbols, including emoji, as punctuation; users that
want to accept emoji should add them separately.
"""
return CATEGORIES[i][0] not in 'PSZC'
def _combining_mark_class():
"""
Builds a regex that matches anything that is a combining mark
"""
cache_regex_from_func(
'combining_mark.txt',
lambda c: unicodedata.category(c)[0] == 'M'
)
def func_to_regex(accept):
"""
Converts a function that accepts a single unicode character into a regex.
Unassigned unicode characters are treated like their neighbors.
"""
ranges = []
start = None
has_accepted = False
for x in range(0x110000):
c = chr(x)
if accept(c):
has_accepted = True
if start is None:
start = c
elif unicodedata.category(c) == 'Cn':
if start is None:
start = c
elif start is not None:
if has_accepted:
ranges.append('-'.join([start, chr(x-1)]))
has_accepted = False
start = None
else:
if has_accepted and start is not None:
ranges.append('-'.join([start, chr(x-1)]))
return '[%s]' % ''.join(ranges)
def _is_combining_mark_codepoint(i):
"""
Report whether a numerical codepoint is a combining mark (Unicode 'M').
"""
return CATEGORIES[i][0] == 'M'
if __name__ == '__main__':
_combining_mark_class()
_non_punct_class()
_emoji_char_class()
cache_regex_from_func('emoji.txt', _is_emoji_codepoint)
cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint)
cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint)

View File

@@ -1,7 +1,6 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize,
half_harmonic_mean
top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises
@@ -114,12 +113,9 @@ def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)
assert_almost_equal(
plant,
half_harmonic_mean(
word_frequency('plan', 'en'),
word_frequency('t', 'en')
)
)
1.0 / plant,
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
)
def test_not_really_random():

View File

@@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency, half_harmonic_mean
from wordfreq import tokenize, word_frequency
def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
ohayou_freq / 2
)
assert_almost_equal(
word_frequency('おはようございます', 'ja'),
half_harmonic_mean(
half_harmonic_mean(ohayou_freq, gozai_freq),
masu_freq
)
1.0 / word_frequency('おはようございます', 'ja'),
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
)

View File

@@ -1,30 +0,0 @@
from nose.tools import assert_less_equal, assert_almost_equal
from wordfreq import half_harmonic_mean
from functools import reduce
import random
def check_hm_properties(inputs):
# I asserted that the half-harmonic-mean formula is associative,
# commutative, monotonic, and less than or equal to its inputs.
# (Less if its inputs are strictly positive, in fact.)
#
# So let's test that what I said is true.
hm1 = reduce(half_harmonic_mean, inputs)
random.shuffle(inputs)
hm2 = reduce(half_harmonic_mean, inputs)
assert_almost_equal(hm1, hm2)
inputs[0] *= 2
hm3 = reduce(half_harmonic_mean, inputs)
assert_less_equal(hm2, hm3)
def test_half_harmonic_mean():
for count in range(2, 6):
for rep in range(10):
# get some strictly positive arbitrary numbers
inputs = [random.expovariate(0.01)
for i in range(count)]
yield check_hm_properties, inputs

View File

@@ -10,13 +10,13 @@ import random
import logging
logger = logging.getLogger(__name__)
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def load_range(filename):
"""
Loads a file from the data path
Load a file from the data path.
"""
with (DATA_PATH / filename).open() as file:
return file.read()
@@ -26,7 +26,6 @@ NON_PUNCT_RANGE = load_range('non_punct.txt')
COMBINING_MARK_RANGE = load_range('combining_mark.txt')
COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
@@ -46,6 +45,7 @@ def simple_tokenize(text):
"""
return [token.casefold() for token in TOKEN_RE.findall(text)]
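A usage sketch (the exact tokens depend on the generated data files, so treat this as an expectation rather than a guarantee):

print(simple_tokenize("Hello, I won't!"))
# expected: ['hello', 'i', "won't"] -- punctuation dropped, case folded,
# and the apostrophe kept inside a token by the (?:'...) group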
mecab_tokenize = None
def tokenize(text, lang):
"""
@@ -114,13 +114,13 @@ def read_cBpack(filename):
"""
    with gzip.open(filename, 'rb') as infile:
        data = msgpack.load(infile, encoding='utf-8')
        header = data[0]
        if (
            not isinstance(header, dict) or header.get('format') != 'cB'
            or header.get('version') != 1
        ):
            raise ValueError("Unexpected header: %r" % header)
        return data[1:]
    header = data[0]
    if (
        not isinstance(header, dict) or header.get('format') != 'cB'
        or header.get('version') != 1
    ):
        raise ValueError("Unexpected header: %r" % header)
    return data[1:]
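A minimal sketch of a payload this reader accepts (hypothetical file name and word lists; packing assumes the same pre-1.0 msgpack API that the encoding='utf-8' argument above implies):

import gzip
import msgpack

payload = [{'format': 'cB', 'version': 1}, ['the'], [], ['of', 'to']]
with gzip.open('/tmp/tiny.msgpack.gz', 'wb') as outfile:
    outfile.write(msgpack.packb(payload))
assert read_cBpack('/tmp/tiny.msgpack.gz') == [['the'], [], ['of', 'to']]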
def available_languages(wordlist='combined'):
@@ -209,18 +209,30 @@ def iter_wordlist(lang, wordlist='combined'):
return itertools.chain(*get_frequency_list(lang, wordlist))
def half_harmonic_mean(a, b):
"""
An associative, commutative, monotonic function that returns a value
less than or equal to both a and b.
Used for estimating the frequency of terms made of multiple tokens, given
the assumption that the tokens very frequently appear together.
"""
return (a * b) / (a + b)
# This dict and inner function are used to implement a "drop everything" cache
# for word_frequency(); the overheads of lru_cache() are comparable to the time
# it takes to look up frequencies from scratch, so something faster is needed.
_wf_cache = {}
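One point worth making explicit (my note, not the commit's): folding half_harmonic_mean over a list is exactly the reciprocal-sum rule that replaces it, because 1/(ab/(a+b)) = 1/a + 1/b. That is why the updated tests in this commit can switch formulas without changing any expected values.

from functools import reduce

freqs = [0.02, 0.005, 0.01]
hm = reduce(lambda a, b: (a * b) / (a + b), freqs)
reciprocal = 1.0 / sum(1.0 / f for f in freqs)
assert abs(hm - reciprocal) < 1e-12   # both equal 1/350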
def _word_frequency(word, lang, wordlist, minimum):
tokens = tokenize(word, lang)
if not tokens:
return minimum
# Frequencies for multiple tokens are combined using the formula
# 1 / f = 1 / f1 + 1 / f2 + ...
# Thus the resulting frequency is less than any individual frequency, and
# the smallest frequency dominates the sum.
freqs = get_frequency_dict(lang, wordlist)
one_over_result = 0.0
for token in tokens:
if token not in freqs:
# If any word is missing, just return the default value
return minimum
one_over_result += 1.0 / freqs[token]
return max(1.0 / one_over_result, minimum)
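For example, with hypothetical frequencies of 1e-4 for 'plan' and 1e-3 for 't', the phrase estimate is 1 / (1/1e-4 + 1/1e-3) = 1/11000 ≈ 9.1e-5: below both inputs, and dominated by the rarer token.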
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', minimum=0.):
"""
Get the frequency of `word` in the language with code `lang`, from the
@@ -246,25 +258,14 @@ of the word frequency that is no greater than the frequency of any of its
of the word frequency that is no greater than the frequency of any of its
individual tokens.
"""
freqs = get_frequency_dict(lang, wordlist)
combined_value = None
tokens = tokenize(word, lang)
if len(tokens) == 0:
return minimum
for token in tokens:
if token not in freqs:
# If any word is missing, just return the default value
return minimum
value = freqs[token]
if combined_value is None:
combined_value = value
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = half_harmonic_mean(combined_value, value)
return max(combined_value, minimum)
args = (word, lang, wordlist, minimum)
try:
return _wf_cache[args]
except KeyError:
if len(_wf_cache) >= CACHE_SIZE:
_wf_cache.clear()
_wf_cache[args] = _word_frequency(*args)
return _wf_cache[args]
@lru_cache(maxsize=100)
@@ -305,8 +306,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
"There aren't enough words in the wordlist to provide %d bits of "
"entropy per word." % bits_per_word
)
selected = [random.choice(choices) for i in range(nwords)]
return ' '.join(selected)
return ' '.join([random.choice(choices) for i in range(nwords)])
def random_ascii_words(lang='en', wordlist='combined', nwords=5,

View File

@@ -1 +1 @@
[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿]
[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]

View File

@@ -1 +1 @@
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■--𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]

View File

@@ -1 +1 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0---zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿]
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ0---zヲ-ᅵ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]