From 5a1fc00aaabd8d121a9f1e71ac6224141c8badfe Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 25 Aug 2015 12:41:48 -0400
Subject: [PATCH] Strip apostrophes from edges of tokens

The issue here is that French text with an elision, such as "d'un",
would be tokenized into "d'" and "un", but re-tokenizing "d'" would
produce "d". Stripping apostrophes from the edges of tokens brings the
process closer to being idempotent.

This also removes URLs from each line before counting tokens in
wordfreq_builder.
---
 wordfreq/tokens.py                               | 2 +-
 wordfreq_builder/tests/test_tokenizer.py         | 2 +-
 wordfreq_builder/wordfreq_builder/word_counts.py | 5 +++++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 9fefdc8..dc540ac 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
diff --git a/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py
index a26feab..2fbc477 100644
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3.50', 'that',
+        'and', "i'll", 'bet', 'y', 'all', '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 5127108..55eff3d 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+
+URL_RE = regex.compile(r'https?://\S+')
 
 
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
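
A minimal sketch of the idempotence problem being fixed, assuming the
patched wordfreq package is importable; the token lists for "d'un" follow
the splits described in the commit message rather than verified output:

    from wordfreq.tokens import simple_tokenize

    # Before the patch (per the commit message), the French elision kept
    # its trailing apostrophe on the first token:
    #     simple_tokenize("d'un")  -> ["d'", 'un']
    # and re-tokenizing that first token dropped the apostrophe:
    #     simple_tokenize("d'")    -> ['d']
    # so tokenizing twice disagreed with tokenizing once.

    # After the patch, the edge apostrophe is stripped on the first pass,
    # making the output a fixed point of the tokenizer:
    tokens = simple_tokenize("d'un")                     # ['d', 'un']
    assert simple_tokenize(' '.join(tokens)) == tokens   # idempotent now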
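
Similarly, a quick usage sketch of the URL stripping added to
count_tokens; the input line here is invented for illustration:

    import regex

    # The same pattern the patch adds to word_counts.py: an http(s)
    # scheme followed by non-whitespace characters.
    URL_RE = regex.compile(r'https?://\S+')

    line = 'wordfreq is documented at https://example.com/wordfreq \n'
    print(URL_RE.sub('', line.strip()))
    # -> 'wordfreq is documented at '
    # The URL is removed before simple_tokenize ever sees the line.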