from wordfreq_builder.word_counts import URL_RE
from nose.tools import eq_


def check_url(url):
    """Assert that URL_RE matches `url` from its first character to its last.

    Fails if URL_RE does not match at the start of `url`, or if the match
    covers anything other than the span (0, len(url)).
    """
    found = URL_RE.match(url)
    # Fail on a missing match first, before asking for its span.
    assert found
    actual_span = found.span()
    eq_(actual_span, (0, len(url)))


def test_url_re():
    """Generate URL_RE checks: URLs that must match in full, then one rejection.

    Yields one (check_url, url) pair per URL that should be matched in its
    entirety; after those, asserts that an ftp:// URL is not matched.
    """
    accepted_urls = (
        # URLs like this are all over the Arabic Wikipedia. Here's one with the
        # student ID blanked out.
        'http://www.ju.edu.jo/alumnicard/0000000.aspx',
        # Non-ASCII character in the path.
        'https://example.com/űnicode.html',
        # Non-ASCII character in the host name.
        'http://☃.net',
    )
    for candidate in accepted_urls:
        yield check_url, candidate

    # This runs only after all of the yielded checks have been consumed.
    rejected = URL_RE.match('ftp://127.0.0.1')
    assert not rejected