Compress the non-punct character class by emitting ranges instead of listing every character

Former-commit-id: f3a365fda9
This commit is contained in:
Joshua Chin 2015-06-25 10:39:04 -04:00
parent 44e7fb5b70
commit d4b5530d0e
2 changed files with 24 additions and 6 deletions

View File

@ -3,6 +3,7 @@ from functools import lru_cache
import unicodedata import unicodedata
from ftfy import chardata from ftfy import chardata
import langcodes import langcodes
import itertools
import msgpack import msgpack
import re import re
import gzip import gzip
@ -45,21 +46,38 @@ def _non_punct_class():
This will classify symbols, including emoji, as punctuation; callers that This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first. want to treat emoji separately should filter them out first.
""" """
try: try:
with open('data/non_punct.txt') as file: with open('wordfreq/data/non_punct.txt') as file:
return file.read() return file.read()
except FileNotFoundError: except FileNotFoundError:
non_punct = [chr(x) for x in range(0x110000) non_punct = [x for x in range(0x110000)
if unicodedata.category(chr(x))[0] not in 'PSZMC'] if unicodedata.category(chr(x))[0] not in 'PSZMC']
out = '[%s]' % ''.join(non_punct) non_punct_ranges = to_ranges(non_punct)
with open('non_punct.txt', mode='w') as file: out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in non_punct_ranges)
with open('wordfreq/data/non_punct.txt', mode='w') as file:
file.write(out) file.write(out)
return out return out
def to_ranges(seq):
    """
    Convert a sorted sequence of ints into a list of inclusive ranges.

    Each range is an ``(start, end)`` tuple covering a maximal run of
    consecutive integers in ``seq``.  An empty sequence yields an empty
    list (the original implementation raised IndexError on ``seq[0]``).

    >>> to_ranges([1, 2, 3, 7])
    [(1, 3), (7, 7)]
    """
    if not seq:
        return []
    ranges = []
    start_range = seq[0]
    # Walk the sequence pairwise; a gap greater than 1 between
    # neighbors closes the current run and opens a new one.
    for previous, elem in zip(seq, seq[1:]):
        if elem - previous != 1:
            ranges.append((start_range, previous))
            start_range = elem
    # The final run always ends at the last element.
    ranges.append((start_range, seq[-1]))
    return ranges
NON_PUNCT_RANGE = _non_punct_class() NON_PUNCT_RANGE = _non_punct_class()
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))

File diff suppressed because one or more lines are too long