mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00

parent e463397edf
commit af29fc4f88

wordfreq_builder/tests/test_urls.py (new file, 20 lines added)
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import URL_RE
+from nose.tools import eq_
+
+
+def check_url(url):
+    match = URL_RE.match(url)
+    assert match
+    eq_(match.span(), (0, len(url)))
+
+
+def test_url_re():
+    # URLs like this are all over the Arabic Wikipedia. Here's one with the
+    # student ID blanked out.
+    yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx'
+
+    yield check_url, 'https://example.com/űnicode.html'
+    yield check_url, 'http://☃.net'
+
+    assert not URL_RE.match('ftp://127.0.0.1')
+
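As a side note (not part of this commit): the tests above use nose's generator-test style, where each `yield check_url, url` line is collected as a separate test case. Modern pytest no longer collects yield-style tests, so a roughly equivalent parametrized sketch would look like this:

import pytest
from wordfreq_builder.word_counts import URL_RE


@pytest.mark.parametrize('url', [
    'http://www.ju.edu.jo/alumnicard/0000000.aspx',
    'https://example.com/űnicode.html',
    'http://☃.net',
])
def test_url_re_matches_whole_url(url):
    # Same checks as check_url(): the pattern must match, and the match
    # must cover the entire string.
    match = URL_RE.match(url)
    assert match is not None
    assert match.span() == (0, len(url))


def test_url_re_rejects_other_schemes():
    # URL_RE is anchored on http/https, so other schemes should not match.
    assert URL_RE.match('ftp://127.0.0.1') is None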
wordfreq_builder/word_counts.py

@@ -9,7 +9,9 @@ import gzip
 import regex
 
 
-URL_RE = regex.compile(r'https?://(?:\B\S)+')
+# Match common cases of URLs: the schema http:// or https:// followed by
+# non-whitespace characters.
+URL_RE = regex.compile(r'https?://(?:\S)+')
 
 
 def count_tokens(filename):
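For illustration (not part of the diff), the behavioural difference between the two patterns: `(?:\B\S)+` demands a non-word-boundary before every character it consumes, and the position right after `//` in an ordinary URL (between `/` and a letter) is a word boundary, so the old pattern cannot match URLs like the ones in the new tests:

import regex

OLD_URL_RE = regex.compile(r'https?://(?:\B\S)+')  # pattern before this commit
NEW_URL_RE = regex.compile(r'https?://(?:\S)+')    # pattern after this commit

url = 'http://www.ju.edu.jo/alumnicard/0000000.aspx'

# \B fails between '/' and 'w', so the '+' group never gets started.
print(OLD_URL_RE.match(url))           # None
# \S+ simply takes every non-whitespace character after the schema.
print(NEW_URL_RE.match(url).span())    # (0, len(url)), i.e. the whole URL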
@@ -17,7 +19,8 @@ def count_tokens(filename):
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
 
-    Unicode errors in the input data will become token boundaries.
+    URLs will be skipped, and Unicode errors will become separate tokens
+    containing '�'.
     """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
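A minimal sketch (assumed, not the real function body, which this diff truncates) of the two behaviours the updated docstring promises, with a plain whitespace split standing in for the project's tokenizer:

from collections import defaultdict

import regex

URL_RE = regex.compile(r'https?://(?:\S)+')


def sketch_count_tokens(filename):
    counts = defaultdict(int)
    # errors='replace' turns undecodable bytes into U+FFFD ('�'), so bad input
    # shows up as replacement characters in the tokens instead of raising.
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            # Strip URLs first so they never reach the token counts.
            line = URL_RE.sub('', line)
            for token in line.split():  # stand-in for the real tokenizer
                counts[token] += 1
    return counts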