Merge pull request #16 from LuminosoInsight/more-tweaking

More tweaking
Joshua Chin 2015-07-10 14:36:18 -04:00
commit 2c573b5a0e
9 changed files with 97 additions and 144 deletions


@@ -21,6 +21,12 @@ install them on Ubuntu:
     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3
 
+## Unicode data
+
+The tokenizers used to split non-Japanese phrases use regexes built using the
+`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+update these regexes, run `scripts/gen_regex.py`.
+
 ## License
 
 `wordfreq` is freely redistributable under the MIT license (see


@@ -4,9 +4,34 @@ import pathlib
 from pkg_resources import resource_filename

+CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))


+def func_to_regex(accept_func):
+    """
+    Given a function that returns True or False for a numerical codepoint,
+    return a regex character class accepting the characters resulting in True.
+    Ranges separated only by unassigned characters are merged for efficiency.
+    """
+    # parsing_range is True if the current codepoint might be in a range that
+    # the regex will accept
+    parsing_range = False
+    ranges = []
+
+    for codepoint, category in enumerate(CATEGORIES):
+        if accept_func(codepoint):
+            if not parsing_range:
+                ranges.append([codepoint, codepoint])
+                parsing_range = True
+            else:
+                ranges[-1][1] = codepoint
+        elif category != 'Cn':
+            parsing_range = False
+
+    return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
+
+
 def cache_regex_from_func(filename, func):
     """
     Generates a regex from a function that accepts a single unicode character,
@@ -16,77 +41,36 @@ def cache_regex_from_func(filename, func):
         file.write(func_to_regex(func))


-def _emoji_char_class():
+def _is_emoji_codepoint(i):
     """
-    Build a regex for emoji substitution. We create a regex character set
-    (like "[a-cv-z]") matching characters we consider emoji.
+    Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
+    character (as future-proofed by the ftfy chardata module) but excluding
+    symbols like © and below U+2600 and the replacement character U+FFFD.
     """
-    cache_regex_from_func(
-        'emoji.txt',
-        lambda c:
-            chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
-            c >= '\u2600' and c != '\ufffd'
-    )
+    return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd


-def _non_punct_class():
+def _is_non_punct_codepoint(i):
     """
-    Builds a regex that matches anything that is not one of the following
-    classes:
+    Report whether a numerical codepoint is not one of the following classes:
     - P: punctuation
     - S: symbols
     - Z: separators
     - C: control characters
-    This will classify symbols, including emoji, as punctuation; callers that
-    want to treat emoji separately should filter them out first.
+    This will classify symbols, including emoji, as punctuation; users that
+    want to accept emoji should add them separately.
     """
-    cache_regex_from_func(
-        'non_punct.txt',
-        lambda c: unicodedata.category(c)[0] not in 'PSZC'
-    )
+    return CATEGORIES[i][0] not in 'PSZC'


-def _combining_mark_class():
+def _is_combining_mark_codepoint(i):
     """
-    Builds a regex that matches anything that is a combining mark
+    Report whether a numerical codepoint is a combining mark (Unicode 'M').
     """
-    cache_regex_from_func(
-        'combining_mark.txt',
-        lambda c: unicodedata.category(c)[0] == 'M'
-    )
+    return CATEGORIES[i][0] == 'M'


-def func_to_regex(accept):
-    """
-    Converts a function that accepts a single unicode character into a regex.
-    Unassigned unicode characters are treated like their neighbors.
-    """
-    ranges = []
-    start = None
-    has_accepted = False
-    for x in range(0x110000):
-        c = chr(x)
-        if accept(c):
-            has_accepted = True
-            if start is None:
-                start = c
-        elif unicodedata.category(c) == 'Cn':
-            if start is None:
-                start = c
-        elif start is not None:
-            if has_accepted:
-                ranges.append('-'.join([start, chr(x-1)]))
-                has_accepted = False
-            start = None
-    else:
-        if has_accepted and start is not None:
-            ranges.append('-'.join([start, chr(x-1)]))
-    return '[%s]' % ''.join(ranges)


 if __name__ == '__main__':
-    _combining_mark_class()
-    _non_punct_class()
-    _emoji_char_class()
+    cache_regex_from_func('emoji.txt', _is_emoji_codepoint)
+    cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint)
+    cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint)
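As an aside to the diff: the new func_to_regex takes an accept function over numerical codepoints and returns a bracketed character class, merging ranges that are separated only by unassigned codepoints. A minimal usage sketch, assuming gen_regex.py (which lives in scripts/) and its ftfy/wordfreq dependencies are importable from where you run it:

    import re
    from gen_regex import func_to_regex  # assumed import path, for illustration only

    # Accept only the ASCII digits; the generated class collapses them into one range.
    digit_class = func_to_regex(lambda codepoint: 0x30 <= codepoint <= 0x39)
    print(digit_class)                    # prints '[0-9]'
    assert re.fullmatch(digit_class, '7') is not None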


@@ -1,7 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize,
-    half_harmonic_mean
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, raises
@@ -114,11 +113,8 @@ def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_almost_equal(
-        plant,
-        half_harmonic_mean(
-            word_frequency('plan', 'en'),
-            word_frequency('t', 'en')
-        )
+        1.0 / plant,
+        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
     )


@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency, half_harmonic_mean
+from wordfreq import tokenize, word_frequency


 def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
         ohayou_freq / 2
     )
     assert_almost_equal(
-        word_frequency('おはようございます', 'ja'),
-        half_harmonic_mean(
-            half_harmonic_mean(ohayou_freq, gozai_freq),
-            masu_freq
-        )
+        1.0 / word_frequency('おはようございます', 'ja'),
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )


@@ -1,30 +0,0 @@
-from nose.tools import assert_less_equal, assert_almost_equal
-from wordfreq import half_harmonic_mean
-from functools import reduce
-import random
-
-
-def check_hm_properties(inputs):
-    # I asserted that the half-harmonic-mean formula is associative,
-    # commutative, monotonic, and less than or equal to its inputs.
-    # (Less if its inputs are strictly positive, in fact.)
-    #
-    # So let's test that what I said is true.
-    hm1 = reduce(half_harmonic_mean, inputs)
-    random.shuffle(inputs)
-    hm2 = reduce(half_harmonic_mean, inputs)
-    assert_almost_equal(hm1, hm2)
-
-    inputs[0] *= 2
-    hm3 = reduce(half_harmonic_mean, inputs)
-    assert_less_equal(hm2, hm3)
-
-
-def test_half_harmonic_mean():
-    for count in range(2, 6):
-        for rep in range(10):
-            # get some strictly positive arbitrary numbers
-            inputs = [random.expovariate(0.01)
-                      for i in range(count)]
-            yield check_hm_properties, inputs
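For context, and not part of the diff: the half_harmonic_mean helper this deleted test exercised is the two-argument case of the reciprocal-sum rule that replaces it in wordfreq/__init__.py, since folding (a * b) / (a + b) over a list gives exactly 1 / (1/f1 + 1/f2 + ...). A small sketch with made-up frequencies illustrating the equivalence:

    import math
    from functools import reduce

    def half_harmonic_mean(a, b):
        # the helper removed in this commit
        return (a * b) / (a + b)

    freqs = [1e-3, 2e-4, 5e-5]          # hypothetical token frequencies
    old_way = reduce(half_harmonic_mean, freqs)
    new_way = 1.0 / sum(1.0 / f for f in freqs)
    assert math.isclose(old_way, new_way)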


@@ -10,13 +10,13 @@ import random
 import logging
 logger = logging.getLogger(__name__)

-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000
+DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))


 def load_range(filename):
     """
-    Loads a file from the data path
+    Load a file from the data path.
     """
     with (DATA_PATH / filename).open() as file:
         return file.read()
@@ -26,7 +26,6 @@ NON_PUNCT_RANGE = load_range('non_punct.txt')
 COMBINING_MARK_RANGE = load_range('combining_mark.txt')
 COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
@@ -46,6 +45,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]

 mecab_tokenize = None


 def tokenize(text, lang):
     """
@@ -209,18 +209,30 @@ def iter_wordlist(lang, wordlist='combined'):
     return itertools.chain(*get_frequency_list(lang, wordlist))


-def half_harmonic_mean(a, b):
-    """
-    An associative, commutative, monotonic function that returns a value
-    less than or equal to both a and b.
-
-    Used for estimating the frequency of terms made of multiple tokens, given
-    the assumption that the tokens very frequently appear together.
-    """
-    return (a * b) / (a + b)
+# This dict and inner function are used to implement a "drop everything" cache
+# for word_frequency(); the overheads of lru_cache() are comparable to the time
+# it takes to look up frequencies from scratch, so something faster is needed.
+_wf_cache = {}
+
+def _word_frequency(word, lang, wordlist, minimum):
+    tokens = tokenize(word, lang)
+    if not tokens:
+        return minimum
+
+    # Frequencies for multiple tokens are combined using the formula
+    # 1 / f = 1 / f1 + 1 / f2 + ...
+    # Thus the resulting frequency is less than any individual frequency, and
+    # the smallest frequency dominates the sum.
+    freqs = get_frequency_dict(lang, wordlist)
+    one_over_result = 0.0
+    for token in tokens:
+        if token not in freqs:
+            # If any word is missing, just return the default value
+            return minimum
+        one_over_result += 1.0 / freqs[token]
+
+    return max(1.0 / one_over_result, minimum)


-@lru_cache(maxsize=CACHE_SIZE)
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
     Get the frequency of `word` in the language with code `lang`, from the
@@ -246,25 +258,14 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
     of the word frequency that is no greater than the frequency of any of its
     individual tokens.
     """
-    freqs = get_frequency_dict(lang, wordlist)
-    combined_value = None
-    tokens = tokenize(word, lang)
-    if len(tokens) == 0:
-        return minimum
-
-    for token in tokens:
-        if token not in freqs:
-            # If any word is missing, just return the default value
-            return minimum
-        value = freqs[token]
-        if combined_value is None:
-            combined_value = value
-        else:
-            # Combine word values using the half-harmonic-mean formula,
-            # (a * b) / (a + b). This operation is associative.
-            combined_value = half_harmonic_mean(combined_value, value)
-    return max(combined_value, minimum)
+    args = (word, lang, wordlist, minimum)
+    try:
+        return _wf_cache[args]
+    except KeyError:
+        if len(_wf_cache) >= CACHE_SIZE:
+            _wf_cache.clear()
+        _wf_cache[args] = _word_frequency(*args)
+        return _wf_cache[args]


 @lru_cache(maxsize=100)
@@ -305,8 +306,7 @@ def random_words(lang='en', wordlist='combined', nwords=5, bits_per_word=12,
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    selected = [random.choice(choices) for i in range(nwords)]
-    return ' '.join(selected)
+    return ' '.join([random.choice(choices) for i in range(nwords)])


 def random_ascii_words(lang='en', wordlist='combined', nwords=5,
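A worked example of the combination rule that _word_frequency now applies, using made-up numbers rather than values from any wordlist: the result is never larger than the smallest per-token frequency, which dominates the reciprocal sum.

    # Hypothetical frequencies for the two tokens of a phrase like "plan.t".
    f_plan, f_t = 2.0e-4, 1.0e-2

    combined = 1.0 / (1.0 / f_plan + 1.0 / f_t)
    print(combined)                      # about 1.96e-04
    assert combined <= min(f_plan, f_t)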


@@ -1 +1 @@
[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿] [̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]


@@ -1 +1 @@
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■--𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿] [☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]


@@ -1 +1 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0---zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿] [0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ0---zヲ-ᅵ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]