diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 9fefdc8..dc540ac 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]


 def remove_arabic_marks(text):
diff --git a/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py
index a26feab..2fbc477 100644
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 5127108..55eff3d 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')


 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
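
The tokens.py change is easiest to see on a small input. The sketch below imitates it with a simplified stand-in for TOKEN_RE (wordfreq's real pattern covers far more than ASCII words; the stand-in is an assumption for illustration only):

    import unicodedata
    import regex  # third-party module, also used by wordfreq

    # Simplified stand-in for wordfreq's TOKEN_RE (illustration only)
    TOKEN_RE_SKETCH = regex.compile(r"[\w']+")

    def simple_tokenize_sketch(text):
        text = unicodedata.normalize('NFC', text)
        # strip("'") trims apostrophes only at the token edges, so
        # contractions such as "won't" keep their internal apostrophe
        return [token.strip("'").casefold()
                for token in TOKEN_RE_SKETCH.findall(text)]

    print(simple_tokenize_sketch("'Tis rock 'n' roll"))
    # -> ['tis', 'rock', 'n', 'roll']

The effect is that quotation-style apostrophes wrapping a token are removed, while apostrophes inside a token survive untouched.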
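
A quick check of the new URL_RE is also worth having, because \B is subtle here: the position between "//" and a following word character (the usual case, e.g. the "e" in example.com) is a word boundary, so \B fails there and the pattern as written never matches an ordinary URL. A plain \S+ tail does what the sub() call in count_tokens() appears to intend; it is shown below as a suggested alternative, not as part of the commit:

    import regex

    URL_RE = regex.compile(r'https?://(?:\B\S)+')      # pattern from the diff
    SUGGESTED_URL_RE = regex.compile(r'https?://\S+')  # suggested alternative

    line = 'see https://example.com/page for details'
    print(URL_RE.sub('', line))            # unchanged: \B fails right after '//'
    print(SUGGESTED_URL_RE.sub('', line))  # 'see  for details'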