mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
caches non_punct regex in non_punct.txt
This commit is contained in:
parent
78c5b589c5
commit
f576ca58ae
@ -29,10 +29,19 @@ EMOJI_RANGE = _emoji_char_class()
|
||||
|
||||
# FIXME: Find a better way to get a list of all non-punctuation Unicode characters
|
||||
def _non_punct_class():
    """
    Build a regex character class matching every "non-punctuation" character.

    A character is included when the first letter of its Unicode general
    category is not one of P (punctuation), S (symbol), Z (separator),
    M (mark) or C (other: control, surrogate, unassigned...).

    Scanning all 0x110000 code points is slow, so the result is cached in
    ``non_punct.txt`` next to this module. The cache is located relative to
    the module file (not the current working directory) and is always read
    and written as UTF-8, so it behaves the same wherever the process is
    started and whatever the locale's default encoding is.

    Returns:
        str: a string of the form ``'[...]'`` suitable for embedding in a
        regular expression.
    """
    import os  # local import so this function does not rely on file-level changes

    cache_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'non_punct.txt')

    try:
        with open(cache_path, encoding='utf-8') as file:
            cached = file.read()
    except (OSError, UnicodeDecodeError):
        # Missing, unreadable or mis-encoded cache: fall back to recomputing.
        cached = ''

    # Only trust a cache that looks like a complete character class; an empty
    # or truncated file would otherwise produce a broken regex.
    if len(cached) > 2 and cached.startswith('[') and cached.endswith(']'):
        return cached

    non_punct = [chr(x) for x in range(0x110000)
                 if unicodedata.category(chr(x))[0] not in 'PSZMC']
    out = '[%s]' % ''.join(non_punct)

    try:
        with open(cache_path, mode='w', encoding='utf-8') as file:
            file.write(out)
    except OSError:
        # Caching is a best-effort optimisation (e.g. read-only install
        # directory); the computed value is still valid, so don't fail.
        pass

    return out
|
||||
|
||||
# Regex character class of "non-punctuation" characters, obtained once at
# import time from _non_punct_class() (which may load it from non_punct.txt).
NON_PUNCT_RANGE = _non_punct_class()
|
||||
|
||||
|
1
wordfreq/non_punct.txt
Normal file
1
wordfreq/non_punct.txt
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user