mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
parent
44e7fb5b70
commit
d4b5530d0e
@ -3,6 +3,7 @@ from functools import lru_cache
|
|||||||
import unicodedata
|
import unicodedata
|
||||||
from ftfy import chardata
|
from ftfy import chardata
|
||||||
import langcodes
|
import langcodes
|
||||||
|
import itertools
|
||||||
import msgpack
|
import msgpack
|
||||||
import re
|
import re
|
||||||
import gzip
|
import gzip
|
||||||
@ -45,21 +46,38 @@ def _non_punct_class():
|
|||||||
This will classify symbols, including emoji, as punctuation; callers that
|
This will classify symbols, including emoji, as punctuation; callers that
|
||||||
want to treat emoji separately should filter them out first.
|
want to treat emoji separately should filter them out first.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open('data/non_punct.txt') as file:
|
with open('wordfreq/data/non_punct.txt') as file:
|
||||||
return file.read()
|
return file.read()
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
non_punct = [chr(x) for x in range(0x110000)
|
non_punct = [x for x in range(0x110000)
|
||||||
if unicodedata.category(chr(x))[0] not in 'PSZMC']
|
if unicodedata.category(chr(x))[0] not in 'PSZMC']
|
||||||
|
|
||||||
out = '[%s]' % ''.join(non_punct)
|
non_punct_ranges = to_ranges(non_punct)
|
||||||
|
|
||||||
with open('non_punct.txt', mode='w') as file:
|
out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
|
||||||
|
for start, end in non_punct_ranges)
|
||||||
|
|
||||||
|
with open('wordfreq/data/non_punct.txt', mode='w') as file:
|
||||||
file.write(out)
|
file.write(out)
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def to_ranges(seq):
    """
    Convert an iterable of ints into a list of inclusive ranges.

    Each run of consecutive integers (every element exactly one greater
    than the element before it) is collapsed into a `(start, end)` tuple
    whose endpoints are both inclusive. For example, `[1, 2, 3, 7]`
    becomes `[(1, 3), (7, 7)]`.

    Any break in the "previous + 1" pattern, including a repeated or
    descending value, closes the current range and starts a new one.
    An empty input yields an empty list.
    """
    ranges = []
    iterator = iter(seq)

    # Pull the first element by hand so that empty input returns an empty
    # list instead of failing on an index lookup.
    try:
        start_range = previous = next(iterator)
    except StopIteration:
        return ranges

    for elem in iterator:
        if elem - previous != 1:
            # The run ended at `previous`; `elem` opens the next run.
            ranges.append((start_range, previous))
            start_range = elem
        previous = elem

    # Close the run that was still open when the input ran out.
    ranges.append((start_range, previous))
    return ranges
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
NON_PUNCT_RANGE = _non_punct_class()
|
NON_PUNCT_RANGE = _non_punct_class()
|
||||||
|
|
||||||
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
|
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user