diff --git a/README.md b/README.md
index 73afd99..c16b7d0 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,12 @@ install them on Ubuntu:
     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3
 
+## Unicode data
+
+The tokenizers used to split non-Japanese phrases use regexes built using the
+`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+update these regexes, run `scripts/gen_regex.py`.
+
 ## License
 
 `wordfreq` is freely redistributable under the MIT license (see
diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 38d4c39..314ede2 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -4,9 +4,34 @@ import pathlib
 from pkg_resources import resource_filename
 
 
+CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 
+def func_to_regex(accept_func):
+    """
+    Given a function that returns True or False for a numerical codepoint,
+    return a regex character class accepting the characters resulting in True.
+    Ranges separated only by unassigned characters are merged for efficiency.
+    """
+    # parsing_range is True if the current codepoint might be in a range that
+    # the regex will accept
+    parsing_range = False
+    ranges = []
+
+    for codepoint, category in enumerate(CATEGORIES):
+        if accept_func(codepoint):
+            if not parsing_range:
+                ranges.append([codepoint, codepoint])
+                parsing_range = True
+            else:
+                ranges[-1][1] = codepoint
+        elif category != 'Cn':
+            parsing_range = False
+
+    return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
+
+
 def cache_regex_from_func(filename, func):
     """
     Generates a regex from a function that accepts a single unicode character,
@@ -16,77 +41,36 @@ def cache_regex_from_func(filename, func):
         file.write(func_to_regex(func))
 
 
-def _emoji_char_class():
+def _is_emoji_codepoint(i):
     """
-    Build a regex for emoji substitution. We create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji.
+    Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
+    character (as future-proofed by the ftfy chardata module) but excluding
+    symbols like © and ™ below U+2600 and the replacement character U+FFFD.
     """
-    cache_regex_from_func(
-        'emoji.txt',
-        lambda c:
-            chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
-            c >= '\u2600' and c != '\ufffd'
-    )
+    return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd
 
-def _non_punct_class():
+def _is_non_punct_codepoint(i):
     """
-    Builds a regex that matches anything that is not one of the following
-    classes:
+    Report whether a numerical codepoint is not one of the following classes:
 
     - P: punctuation
     - S: symbols
     - Z: separators
     - C: control characters
 
-    This will classify symbols, including emoji, as punctuation; callers that
-    want to treat emoji separately should filter them out first.
+    This will classify symbols, including emoji, as punctuation; users that
+    want to accept emoji should add them separately.
     """
-    cache_regex_from_func(
-        'non_punct.txt',
-        lambda c: unicodedata.category(c)[0] not in 'PSZC'
-    )
+    return CATEGORIES[i][0] not in 'PSZC'
 
-def _combining_mark_class():
+def _is_combining_mark_codepoint(i):
     """
-    Builds a regex that matches anything that is a combining mark
+    Report whether a numerical codepoint is a combining mark (Unicode 'M').
""" - cache_regex_from_func( - 'combining_mark.txt', - lambda c: unicodedata.category(c)[0] == 'M' - ) - - -def func_to_regex(accept): - """ - Converts a function that accepts a single unicode character into a regex. - Unassigned unicode characters are treated like their neighbors. - """ - ranges = [] - start = None - has_accepted = False - for x in range(0x110000): - c = chr(x) - - if accept(c): - has_accepted = True - if start is None: - start = c - elif unicodedata.category(c) == 'Cn': - if start is None: - start = c - elif start is not None: - if has_accepted: - ranges.append('-'.join([start, chr(x-1)])) - has_accepted = False - start = None - else: - if has_accepted and start is not None: - ranges.append('-'.join([start, chr(x-1)])) - - return '[%s]' % ''.join(ranges) + return CATEGORIES[i][0] == 'M' if __name__ == '__main__': - _combining_mark_class() - _non_punct_class() - _emoji_char_class() + cache_regex_from_func('emoji.txt', _is_emoji_codepoint) + cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint) + cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint) diff --git a/tests/test.py b/tests/test.py index ba52fb8..59d40f8 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,7 +1,6 @@ from wordfreq import ( word_frequency, available_languages, cB_to_freq, - top_n_list, random_words, random_ascii_words, tokenize, - half_harmonic_mean + top_n_list, random_words, random_ascii_words, tokenize ) from nose.tools import ( eq_, assert_almost_equal, assert_greater, raises @@ -114,12 +113,9 @@ def test_phrase_freq(): plant = word_frequency("plan.t", 'en') assert_greater(plant, 0) assert_almost_equal( - plant, - half_harmonic_mean( - word_frequency('plan', 'en'), - word_frequency('t', 'en') - ) - ) + 1.0 / plant, + 1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en') + ) def test_not_really_random(): diff --git a/tests/test_japanese.py b/tests/test_japanese.py index a21eedd..d5a73b3 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -1,5 +1,5 @@ from nose.tools import eq_, assert_almost_equal -from wordfreq import tokenize, word_frequency, half_harmonic_mean +from wordfreq import tokenize, word_frequency def test_tokens(): @@ -17,10 +17,7 @@ def test_combination(): ohayou_freq / 2 ) assert_almost_equal( - word_frequency('おはようございます', 'ja'), - half_harmonic_mean( - half_harmonic_mean(ohayou_freq, gozai_freq), - masu_freq - ) + 1.0 / word_frequency('おはようございます', 'ja'), + 1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq ) diff --git a/tests/test_math.py b/tests/test_math.py deleted file mode 100644 index c2b3746..0000000 --- a/tests/test_math.py +++ /dev/null @@ -1,30 +0,0 @@ -from nose.tools import assert_less_equal, assert_almost_equal -from wordfreq import half_harmonic_mean -from functools import reduce -import random - - -def check_hm_properties(inputs): - # I asserted that the half-harmonic-mean formula is associative, - # commutative, monotonic, and less than or equal to its inputs. - # (Less if its inputs are strictly positive, in fact.) - # - # So let's test that what I said is true. 
-    hm1 = reduce(half_harmonic_mean, inputs)
-    random.shuffle(inputs)
-    hm2 = reduce(half_harmonic_mean, inputs)
-    assert_almost_equal(hm1, hm2)
-
-    inputs[0] *= 2
-    hm3 = reduce(half_harmonic_mean, inputs)
-    assert_less_equal(hm2, hm3)
-
-
-def test_half_harmonic_mean():
-    for count in range(2, 6):
-        for rep in range(10):
-            # get some strictly positive arbitrary numbers
-            inputs = [random.expovariate(0.01)
-                      for i in range(count)]
-            yield check_hm_properties, inputs
-
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 63c4c26..a5ac0ec 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -10,13 +10,13 @@ import random
 import logging
 
 logger = logging.getLogger(__name__)
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000
+DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 
 def load_range(filename):
     """
-    Loads a file from the data path
+    Load a file from the data path.
     """
     with (DATA_PATH / filename).open() as file:
         return file.read()
@@ -26,7 +26,6 @@ NON_PUNCT_RANGE = load_range('non_punct.txt')
 COMBINING_MARK_RANGE = load_range('combining_mark.txt')
 
 COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 
 
@@ -46,6 +45,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
+
 mecab_tokenize = None
 def tokenize(text, lang):
     """
@@ -114,13 +114,13 @@ def read_cBpack(filename):
     """
     with gzip.open(filename, 'rb') as infile:
         data = msgpack.load(infile, encoding='utf-8')
-    header = data[0]
-    if (
-        not isinstance(header, dict) or header.get('format') != 'cB'
-        or header.get('version') != 1
-    ):
-        raise ValueError("Unexpected header: %r" % header)
-    return data[1:]
+        header = data[0]
+        if (
+            not isinstance(header, dict) or header.get('format') != 'cB'
+            or header.get('version') != 1
+        ):
+            raise ValueError("Unexpected header: %r" % header)
+        return data[1:]
 
 
 def available_languages(wordlist='combined'):
@@ -209,18 +209,30 @@ def iter_wordlist(lang, wordlist='combined'):
     return itertools.chain(*get_frequency_list(lang, wordlist))
 
 
-def half_harmonic_mean(a, b):
-    """
-    An associative, commutative, monotonic function that returns a value
-    less than or equal to both a and b.
+# This dict and inner function are used to implement a "drop everything" cache
+# for word_frequency(); the overheads of lru_cache() are comparable to the time
+# it takes to look up frequencies from scratch, so something faster is needed.
+_wf_cache = {}
 
-    Used for estimating the frequency of terms made of multiple tokens, given
-    the assumption that the tokens very frequently appear together.
-    """
-    return (a * b) / (a + b)
+def _word_frequency(word, lang, wordlist, minimum):
+    tokens = tokenize(word, lang)
+    if not tokens:
+        return minimum
+    # Frequencies for multiple tokens are combined using the formula
+    #     1 / f = 1 / f1 + 1 / f2 + ...
+    # Thus the resulting frequency is less than any individual frequency, and
+    # the smallest frequency dominates the sum.
+    freqs = get_frequency_dict(lang, wordlist)
+    one_over_result = 0.0
+    for token in tokens:
+        if token not in freqs:
+            # If any word is missing, just return the default value
+            return minimum
+        one_over_result += 1.0 / freqs[token]
+
+    return max(1.0 / one_over_result, minimum)
 
 
-@lru_cache(maxsize=CACHE_SIZE)
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
     Get the frequency of `word` in the language with code `lang`, from the
@@ -246,25 +258,14 @@
     of the word frequency that is no greater than the frequency of any of
     its individual tokens.
     """
-    freqs = get_frequency_dict(lang, wordlist)
-    combined_value = None
-    tokens = tokenize(word, lang)
-
-    if len(tokens) == 0:
-        return minimum
-
-    for token in tokens:
-        if token not in freqs:
-            # If any word is missing, just return the default value
-            return minimum
-        value = freqs[token]
-        if combined_value is None:
-            combined_value = value
-        else:
-            # Combine word values using the half-harmonic-mean formula,
-            # (a * b) / (a + b). This operation is associative.
-            combined_value = half_harmonic_mean(combined_value, value)
-    return max(combined_value, minimum)
+    args = (word, lang, wordlist, minimum)
+    try:
+        return _wf_cache[args]
+    except KeyError:
+        if len(_wf_cache) >= CACHE_SIZE:
+            _wf_cache.clear()
+        _wf_cache[args] = _word_frequency(*args)
+        return _wf_cache[args]
 
 
 @lru_cache(maxsize=100)
@@ -305,8 +306,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    selected = [random.choice(choices) for i in range(nwords)]
-    return ' '.join(selected)
+    return ' '.join([random.choice(choices) for i in range(nwords)])
 
 
 def random_ascii_words(lang='en', wordlist='combined', nwords=5,
diff --git a/wordfreq/data/combining_mark.txt b/wordfreq/data/combining_mark.txt
index ac83af1..6dc0d7c 100644
--- a/wordfreq/data/combining_mark.txt
+++ b/wordfreq/data/combining_mark.txt
@@ -1 +1 @@
-[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿]
\ No newline at end of file
+[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
\ No newline at end of file
diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt
index 15c56fb..c7f60e7 100644
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@@ -1 +1 @@
-[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■-￸-𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]
\ No newline at end of file
+[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]
\ No newline at end of file
diff --git a/wordfreq/data/non_punct.txt b/wordfreq/data/non_punct.txt
index 85af3b5..1bf3b27 100644
--- a/wordfreq/data/non_punct.txt
+++ b/wordfreq/data/non_punct.txt
@@ -1 +1 @@
-[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0-9A-Za-zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿]
\ No newline at end of file
+[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ0-9A-Za-zヲ-ᅵ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]
\ No newline at end of file
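
The range-merging rule in the new func_to_regex can be seen on a toy category
table. This is a standalone sketch, not part of the patch: TOY_CATEGORIES and
toy_func_to_regex are invented stand-ins for the real CATEGORIES list of all
0x110000 codepoints.

    # Codepoints 65..70 are the characters 'A'..'F'.
    TOY_CATEGORIES = {65: 'Lu', 66: 'Lu', 67: 'Cn', 68: 'Lu', 69: 'Po', 70: 'Lu'}

    def toy_func_to_regex(accept_func):
        parsing_range = False
        ranges = []
        for codepoint, category in sorted(TOY_CATEGORIES.items()):
            if accept_func(codepoint):
                if not parsing_range:
                    ranges.append([codepoint, codepoint])
                    parsing_range = True
                else:
                    ranges[-1][1] = codepoint
            elif category != 'Cn':
                # Only an assigned, non-matching codepoint closes a range;
                # unassigned ('Cn') gaps are absorbed into the current range.
                parsing_range = False
        return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)

    # 'C' (unassigned) lets A, B and D merge into one range, while 'E'
    # (punctuation) forces 'F' into a new range.
    print(toy_func_to_regex(lambda i: TOY_CATEGORIES[i] == 'Lu'))  # [A-DF-F]

Unlike the old func_to_regex, ranges now begin and end only on accepted
codepoints, never on unassigned ones, which is part of why the regenerated
data files above differ.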
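The 1/f combination rule that replaces half_harmonic_mean can be checked by
hand. A quick standalone sketch (not part of the patch; the two frequencies
are made-up values):

    def combine(freqs):
        # 1 / f = 1 / f1 + 1 / f2 + ...
        return 1.0 / sum(1.0 / f for f in freqs)

    plan, t = 2e-4, 1e-2           # hypothetical frequencies of 'plan' and 't'
    combined = combine([plan, t])  # 1 / (5000 + 100) ~= 1.96e-4
    assert combined < min(plan, t)                   # never exceeds any input
    assert combine([plan, t]) == combine([t, plan])  # order-independent

As the comment in _word_frequency says, the smallest frequency dominates: the
result sits just below the frequency of 'plan' no matter how common 't' is.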
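The cache that replaces @lru_cache is deliberately crude: a plain dict lookup
plus an occasional wholesale clear(), instead of lru_cache()'s per-call
bookkeeping. A minimal standalone sketch of the same "drop everything"
pattern (illustrative only; cached_call and the tiny CACHE_SIZE are invented
for the demonstration):

    CACHE_SIZE = 4   # tiny limit so the flush is visible
    _cache = {}

    def cached_call(func, *args):
        try:
            return _cache[args]
        except KeyError:
            if len(_cache) >= CACHE_SIZE:
                _cache.clear()   # flush everything rather than evict one entry
            _cache[args] = func(*args)
            return _cache[args]

    for n in range(6):
        cached_call(pow, 2, n)
    print(len(_cache))  # 2 -- the cache was emptied once it held 4 entries

Occasionally recomputing a flushed entry costs far less than maintaining LRU
order on every call, which is the trade-off the comment in __init__.py
describes.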