caches non_punct regex in non_punct.txt

Former-commit-id: f576ca58ae
This commit is contained in:
Joshua Chin 2015-06-24 17:11:50 -04:00
parent f98c6c4401
commit d48a44b4e3
2 changed files with 13 additions and 3 deletions

View File

@ -29,10 +29,19 @@ EMOJI_RANGE = _emoji_char_class()
# FIXME: Find a better way to get a list of all non punctuation unicodes
def _non_punct_class(cache_path=None):
    """Return a regex character class matching every non-punctuation character.

    The class contains every code point whose Unicode general category does
    not start with P, S, Z, M or C. Building it means scanning the whole
    Unicode range, so the result is cached in a text file and reused on
    later calls.

    Args:
        cache_path: Optional path of the cache file. Defaults to
            ``non_punct.txt`` in the same directory as this module, so the
            lookup does not depend on the process's working directory.

    Returns:
        A string of the form ``'[...]'`` listing the selected characters.
    """
    # Local import: the file's top-level import block is not visible from here.
    import os

    if cache_path is None:
        cache_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'non_punct.txt')

    # The cache is almost entirely non-ASCII text, so the encoding is fixed
    # explicitly instead of relying on the platform's locale default.
    try:
        with open(cache_path, encoding='utf-8') as file:
            cached = file.read()
    except FileNotFoundError:
        cached = ''
    # An empty file (e.g. left by an interrupted write) counts as a miss.
    if cached:
        return cached

    non_punct = [char for char in map(chr, range(0x110000))
                 if unicodedata.category(char)[0] not in 'PSZMC']
    out = '[%s]' % ''.join(non_punct)

    # Caching is best-effort: a read-only install location must not prevent
    # the module from importing, it only costs a recomputation next time.
    try:
        with open(cache_path, mode='w', encoding='utf-8') as file:
            file.write(out)
    except OSError:
        pass
    return out


NON_PUNCT_RANGE = _non_punct_class()

1
wordfreq/non_punct.txt Normal file

File diff suppressed because one or more lines are too long