now uses ranges

Former-commit-id: f3a365fda9
2024-12-24 18:01:38 +00:00 · 2015-06-25 10:39:04 -04:00 · 2015-06-25 10:39:04 -04:00 · 1d2615f6d6
commit 1d2615f6d6
parent 50398a8ce1
2 changed files with 24 additions and 6 deletions
--- a/wordfreq/init.py
+++ b/wordfreq/init.py
@ -3,6 +3,7 @@ from functools import lru_cache
 import unicodedata
 from ftfy import chardata
 import langcodes
+import itertools
 import msgpack
 import re
 import gzip
@ -45,21 +46,38 @@ def _non_punct_class():
    This will classify symbols, including emoji, as punctuation; callers that
    want to treat emoji separately should filter them out first.
    """
-
    try:
-        with open('data/non_punct.txt') as file:
+        with open('wordfreq/data/non_punct.txt') as file:
            return file.read()
    except FileNotFoundError:
-        non_punct = [chr(x) for x in range(0x110000)
+        non_punct = [x for x in range(0x110000)
                        if unicodedata.category(chr(x))[0] not in 'PSZMC']

-        out = '[%s]' % ''.join(non_punct)
+        non_punct_ranges = to_ranges(non_punct)

-        with open('non_punct.txt', mode='w') as file:
+        out = '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
+                for start, end in non_punct_ranges)
+
+        with open('wordfreq/data/non_punct.txt', mode='w') as file:
            file.write(out)

        return out

+def to_ranges(seq):
+    """
+    Collapse an iterable of ints into a list of inclusive (start, end)
+    ranges, one per run of consecutive values; empty input gives [].
+    """
+    ranges = []
+    for elem in seq:
+        if ranges and elem - ranges[-1][1] == 1:
+            ranges[-1] = (ranges[-1][0], elem)
+        else:
+            ranges.append((elem, elem))
+    return ranges
+
+
+
 NON_PUNCT_RANGE = _non_punct_class()

 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
--- a/wordfreq/data/non_punct.txt
+++ b/wordfreq/data/non_punct.txt