Strip apostrophes from edges of tokens

The issue here is that French text containing an apostrophe, such as
"d'un", was split into "d'" and "un", but re-tokenizing "d'" would
produce just "d". Stripping apostrophes from the edges of tokens makes
tokenization idempotent: every token it outputs tokenizes back to itself.
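
A minimal sketch of the failure, using a hypothetical stand-in pattern
(the real TOKEN_RE lives in wordfreq's tokenizer) that reproduces the
edge-apostrophe behavior described above:

    import re

    # Hypothetical stand-in for TOKEN_RE: a word, optionally keeping a
    # trailing apostrophe when another word follows it directly.
    TOKEN_RE = re.compile(r"\w+(?:'(?=\w))?")

    def tokenize_old(text):
        return [t.casefold() for t in TOKEN_RE.findall(text)]

    def tokenize_new(text):
        return [t.strip("'").casefold() for t in TOKEN_RE.findall(text)]

    print(tokenize_old("d'un"))   # ["d'", 'un']
    print(tokenize_old("d'"))     # ['d']  -- "d'" does not survive a round trip
    print(tokenize_new("d'un"))   # ['d', 'un']
    print(tokenize_new("d"))      # ['d']  -- every output token re-tokenizes to itself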


Former-commit-id: 5a1fc00aaa
Robyn Speer 2015-08-25 12:41:48 -04:00
parent 0b282c5055
commit b22a4b0f02
3 changed files with 7 additions and 2 deletions


@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
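
For reference, str.strip("'") removes apostrophes only at the edges of a
token; interior apostrophes, as in contractions, are untouched (so the
y/all split in the test below comes from how cld2_surface_tokenizer
divides the text, not from this strip):

    for token in ["d'", "'tis", "won't", "y'all"]:
        print(token, '->', token.strip("'"))
    # d' -> d
    # 'tis -> tis
    # won't -> won't
    # y'all -> y'all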


@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)


@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
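
A minimal sketch of the effect of the new URL scrub on one input line;
the plain \S+ tail here is a simplification of the pattern in the diff,
and the stdlib re module stands in for the third-party regex package:

    import re

    URL_RE = re.compile(r'https?://\S+')  # assumption: simplified URL pattern

    line = 'see http://example.com/page for details\n'
    print(URL_RE.sub('', line.strip()))
    # prints "see  for details" -- the URL is gone before simple_tokenize sees the line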