caches non_punct regex in non_punct.txt

This commit is contained in:
Joshua Chin 2015-06-24 17:11:50 -04:00
parent 78c5b589c5
commit f576ca58ae
2 changed files with 13 additions and 3 deletions

View File

@@ -29,10 +29,19 @@ EMOJI_RANGE = _emoji_char_class()
# FIXME: Find a better way to get a list of all non punctuation unicodes
def _non_punct_class(cache_path=None):
    """Return a regex character class of all non-punctuation characters.

    The class contains every code point whose Unicode general category
    does not start with ``P`` (punctuation), ``S`` (symbol), ``Z``
    (separator), ``M`` (mark) or ``C`` (other/control).  Scanning the whole
    code space is slow, so the result is cached in a text file and reused
    on later calls.

    Args:
        cache_path: Path of the cache file.  Defaults to ``non_punct.txt``
            in the same directory as this module, so the result does not
            depend on the process's current working directory.

    Returns:
        A string of the form ``'[...]'`` suitable for embedding in a
        regular expression.
    """
    import os

    if cache_path is None:
        # Anchor the cache next to this module; a bare relative name would
        # be resolved against whatever directory the caller happens to be in.
        cache_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'non_punct.txt')

    # Explicit UTF-8: the class holds characters from every script, which
    # most locale default encodings cannot represent.
    try:
        with open(cache_path, encoding='utf-8') as cache:
            cached = cache.read()
    except (OSError, UnicodeDecodeError):
        # Missing, unreadable or undecodable cache: fall through and rebuild.
        cached = ''

    # Only trust a complete bracketed class.  An empty or truncated file
    # (e.g. from an interrupted write) would otherwise become a broken regex.
    if len(cached) > 2 and cached.startswith('[') and cached.endswith(']'):
        # NOTE: the cache is not keyed to unicodedata.unidata_version, so a
        # file produced under a different Unicode version is reused as-is.
        return cached

    non_punct = [chr(x) for x in range(0x110000)
                 if unicodedata.category(chr(x))[0] not in 'PSZMC']
    out = '[%s]' % ''.join(non_punct)

    try:
        with open(cache_path, mode='w', encoding='utf-8') as cache:
            cache.write(out)
    except OSError:
        # Caching is best effort: a read-only install location must not
        # prevent callers from getting the computed value.
        pass
    return out
# Regex character class of non-punctuation characters, built once at import
# time by _non_punct_class() (which reads or creates the non_punct.txt cache).
NON_PUNCT_RANGE = _non_punct_class()

1
wordfreq/non_punct.txt Normal file

File diff suppressed because one or more lines are too long