Compress the non-punct character class by emitting ranges instead of listing every character

Former-commit-id: f3a365fda9
This commit is contained in:
Joshua Chin 2015-06-25 10:39:04 -04:00
parent 44e7fb5b70
commit d4b5530d0e
2 changed files with 24 additions and 6 deletions

View File

@ -3,6 +3,7 @@ from functools import lru_cache
import unicodedata import unicodedata
from ftfy import chardata from ftfy import chardata
import langcodes import langcodes
import itertools
import msgpack import msgpack
import re import re
import gzip import gzip
@ -45,21 +46,38 @@ def _non_punct_class():
This will classify symbols, including emoji, as punctuation; callers that This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first. want to treat emoji separately should filter them out first.
""" """
try: try:
with open('data/non_punct.txt') as file: with open('wordfreq/data/non_punct.txt') as file:
return file.read() return file.read()
except FileNotFoundError: except FileNotFoundError:
non_punct = [chr(x) for x in range(0x110000) non_punct = [x for x in range(0x110000)
if unicodedata.category(chr(x))[0] not in 'PSZMC'] if unicodedata.category(chr(x))[0] not in 'PSZMC']
out = '[%s]' % ''.join(non_punct) non_punct_ranges = to_ranges(non_punct)
with open('non_punct.txt', mode='w') as file: out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in non_punct_ranges)
with open('wordfreq/data/non_punct.txt', mode='w') as file:
file.write(out) file.write(out)
return out return out
def to_ranges(seq):
    """
    Convert a sorted sequence of ints into a list of inclusive ranges.

    Each range is an ``(start, end)`` tuple covering a maximal run of
    consecutive integers in ``seq``.  An empty sequence yields an empty
    list (the original implementation raised IndexError on ``seq[0]``).

    >>> to_ranges([1, 2, 3, 7])
    [(1, 3), (7, 7)]
    """
    if not seq:
        return []
    ranges = []
    start_range = seq[0]
    # Walk the sequence pairwise; a gap greater than 1 between
    # neighbors closes the current run and opens a new one.
    for previous, elem in zip(seq, seq[1:]):
        if elem - previous != 1:
            ranges.append((start_range, previous))
            start_range = elem
    # The final run always ends at the last element.
    ranges.append((start_range, seq[-1]))
    return ranges
NON_PUNCT_RANGE = _non_punct_class() NON_PUNCT_RANGE = _non_punct_class()
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))

File diff suppressed because one or more lines are too long