diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 9fefdc8..dc540ac 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]


 def remove_arabic_marks(text):
diff --git a/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py
index a26feab..2fbc477 100644
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 5127108..55eff3d 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')


 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
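
The tokens.py change is easiest to see on a small input. The sketch below imitates it with a simplified stand-in for TOKEN_RE (wordfreq's real pattern covers far more than ASCII words; the stand-in is an assumption for illustration only):

    import unicodedata
    import regex  # third-party module, also used by wordfreq

    # Simplified stand-in for wordfreq's TOKEN_RE (illustration only)
    TOKEN_RE_SKETCH = regex.compile(r"[\w']+")

    def simple_tokenize_sketch(text):
        text = unicodedata.normalize('NFC', text)
        # strip("'") trims apostrophes only at the token edges, so
        # contractions such as "won't" keep their internal apostrophe
        return [token.strip("'").casefold()
                for token in TOKEN_RE_SKETCH.findall(text)]

    print(simple_tokenize_sketch("'Tis rock 'n' roll"))
    # -> ['tis', 'rock', 'n', 'roll']

The effect is that quotation-style apostrophes wrapping a token are removed, while apostrophes inside a token survive untouched.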
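
A quick check of the new URL_RE is also worth having, because \B is subtle here: the position between "//" and a following word character (the usual case, e.g. the "e" in example.com) is a word boundary, so \B fails there and the pattern as written never matches an ordinary URL. A plain \S+ tail does what the sub() call in count_tokens() appears to intend; it is shown below as a suggested alternative, not as part of the commit:

    import regex

    URL_RE = regex.compile(r'https?://(?:\B\S)+')      # pattern from the diff
    SUGGESTED_URL_RE = regex.compile(r'https?://\S+')  # suggested alternative

    line = 'see https://example.com/page for details'
    print(URL_RE.sub('', line))            # unchanged: \B fails right after '//'
    print(SUGGESTED_URL_RE.sub('', line))  # 'see  for details'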