Strip apostrophes from edges of tokens

The issue here is that French text with an apostrophe, such as "d'un",
was split into "d'" and "un"; but if "d'" was then re-tokenized on its
own, it came out as "d", so tokenizing a second time gave a different
result than tokenizing once. Stripping apostrophes from the edges of
tokens makes the process closer to idempotent.
Rob Speer 2015-08-25 12:41:48 -04:00
parent a8e7c29068
commit 5a1fc00aaa
3 changed files with 7 additions and 2 deletions
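
At the token level, the fix is easy to check in plain Python (the sample
tokens here are illustrative, not from the commit): strip("'") removes
apostrophes only at a token's edges, leaves internal contractions alone,
and applying it twice changes nothing.

    >>> [t.strip("'").casefold() for t in ["d'", "won't", "'tis"]]
    ['d', "won't", 'tis']
    >>> "d'".strip("'").strip("'")  # stripping twice == stripping once
    'd'

Note the trade-off visible in the third token: a genuinely leading
apostrophe, as in "'tis", is stripped as well.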


@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):


@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
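
To see where these expected tokens come from, the tokenizer can be run by
hand. A minimal sketch, assuming wordfreq_builder and the cld2 bindings it
depends on are installed; the import path is my assumption about the
repository layout:

    # Assumed import path -- adjust to the actual module layout.
    from wordfreq_builder.tokenizers import cld2_surface_tokenizer

    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    print(cld2_surface_tokenizer(text))  # compare against the expected tokens above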


@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
 
 
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
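
Putting the pieces together, here is a self-contained sketch of the
counting pipeline after this commit. TOKEN_RE below is a simplified
stand-in (the real pattern is defined elsewhere in wordfreq and is not
part of this diff); URL_RE is the pattern the commit adds, and regex is
the third-party module of that name:

    from collections import defaultdict
    import unicodedata
    import regex  # third-party: pip install regex

    TOKEN_RE = regex.compile(r"[\w']+")            # stand-in, for illustration only
    URL_RE = regex.compile(r'https?://(?:\B\S)+')  # as added by this commit

    def simple_tokenize(text):
        # Same steps as the patched function above: NFC-normalize, find
        # tokens, strip edge apostrophes, casefold.
        text = unicodedata.normalize('NFC', text)
        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]

    def count_tokens(filename):
        counts = defaultdict(int)
        with open(filename, encoding='utf-8', errors='replace') as infile:
            for line in infile:
                # Delete whatever URL_RE matches before counting tokens.
                line = URL_RE.sub('', line.strip())
                for token in simple_tokenize(line):
                    counts[token] += 1
        return counts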