Commit

Merge branch 'master' into master

adbar authored Jan 4, 2024
2 parents b1e5f91 + de57ac1 commit f8d208b
Showing 12 changed files with 125 additions and 57 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/tests.yml
@@ -17,6 +17,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"]
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
include:
@@ -27,12 +28,16 @@ jobs:
python-version: 3.7
- os: macos-latest
python-version: 3.8
- os: macos-latest
python-version: "3.10"
- os: macos-latest
python-version: "3.12"
- os: windows-latest
python-version: 3.8
- os: ubuntu-latest
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.12-dev"
python-version: "3.12"
steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
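
Note: the new comment points at the manifest that actions/setup-python draws its interpreters from. A minimal sketch for inspecting it, assuming the manifest is a JSON array of objects with "version" and "stable" keys (layout as of early 2024, not guaranteed):

    # Sketch: list the stable Python versions available to actions/setup-python.
    import json
    import urllib.request

    MANIFEST = ("https://raw.githubusercontent.com/actions/"
                "python-versions/main/versions-manifest.json")
    with urllib.request.urlopen(MANIFEST) as resp:
        releases = json.load(resp)
    print(sorted({r["version"] for r in releases if r.get("stable")}))
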
8 changes: 5 additions & 3 deletions setup.py
@@ -112,10 +112,12 @@ def get_long_description():
"charset_normalizer >= 3.0.1; python_version < '3.7'",
"charset_normalizer >= 3.2.0; python_version >= '3.7'",
"courlan >= 0.9.5",
"htmldate >= 1.6.0",
"htmldate >= 1.6.1",
"importlib_metadata; python_version < '3.8'",
"justext >= 3.0.0",
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
"lxml == 4.9.2 ; platform_system == 'Darwin'",
# see tests on GitHub Actions
"lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
"psutil",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
"urllib3 >= 1.26, < 3; python_version >= '3.7'",
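
Note: the new lxml pins use PEP 508 environment markers, which pip evaluates against the target interpreter and platform at install time. A minimal sketch of that evaluation, assuming the third-party packaging library is available; the environment dicts are illustrative:

    # Sketch: evaluating a PEP 508 marker like the ones in the new lxml pins.
    from packaging.markers import Marker

    marker = Marker("platform_system == 'Darwin' and python_version <= '3.8'")
    print(marker.evaluate({"platform_system": "Darwin", "python_version": "3.8"}))   # True
    print(marker.evaluate({"platform_system": "Linux", "python_version": "3.11"}))   # False
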
16 changes: 8 additions & 8 deletions tests/cli_tests.py
@@ -218,7 +218,7 @@ def test_download():
#teststring = fetch_url(url)
#assert teststring is not None
#assert cli.examine(teststring, args, url) is None
url = 'https://httpbun.org/html'
url = 'https://httpbun.com/html'
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
@@ -408,27 +408,27 @@ def test_crawling():
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
testargs = ['', '--crawl', 'https://httpbun.com/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'
assert f.getvalue() == 'https://httpbun.com/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on GitHub Actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
args.crawl = 'https://httpbun.com/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
@@ -437,13 +437,13 @@ def test_crawling():
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'
assert f.getvalue().strip() == 'https://httpbun.com/html'


def test_probing():
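
Note: these edits only move the tests from httpbun.org to httpbun.com; the pattern they exercise is unchanged. A minimal sketch of that pattern with the public fetch_url helper (the endpoint is a live network dependency, which is exactly why the host had to change):

    # Sketch: fetch_url() returns the decoded page on success and None on
    # failure, so a dead test endpoint fails every downstream assertion.
    from trafilatura import fetch_url

    html = fetch_url("https://httpbun.com/html")
    if html is not None:
        print(len(html))
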
18 changes: 9 additions & 9 deletions tests/downloads_tests.py
@@ -53,27 +53,27 @@ def test_fetch():
assert _send_request('', True, DEFAULT_CONFIG) is None

# is_live general tests
assert _urllib3_is_live_page('https://httpbun.org/status/301') is True
assert _urllib3_is_live_page('https://httpbun.org/status/404') is False
assert is_live_page('https://httpbun.org/status/403') is False
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
assert is_live_page('https://httpbun.com/status/403') is False
# is_live pycurl tests
if pycurl is not None:
assert _pycurl_is_live_page('https://httpbun.org/status/301') is True
assert _pycurl_is_live_page('https://httpbun.com/status/301') is True

# fetch_url
assert fetch_url('#@1234') is None
assert fetch_url('https://httpbun.org/status/404') is None
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.org/status/200'
response = _send_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
url = 'https://httpbun.com/status/200'
response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
@@ -155,7 +155,7 @@ def test_queue():
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
inputurls = ['https://httpbun.org/status/301', 'https://httpbun.org/status/304', 'https://httpbun.org/status/200', 'https://httpbun.org/status/300', 'https://httpbun.org/status/400', 'https://httpbun.org/status/505']
inputurls = ['https://httpbun.com/status/301', 'https://httpbun.com/status/304', 'https://httpbun.com/status/200', 'https://httpbun.com/status/300', 'https://httpbun.com/status/400', 'https://httpbun.com/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
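
Note: the liveness probes pinned down above can be reproduced with the public helper alone; a short sketch (expected values mirror the assertions, but the endpoints are live dependencies):

    # Sketch: HEAD-based liveness check -- a redirect counts as live,
    # a 404 does not.
    from trafilatura.downloads import is_live_page

    print(is_live_page("https://httpbun.com/status/301"))  # expected: True
    print(is_live_page("https://httpbun.com/status/404"))  # expected: False
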
23 changes: 21 additions & 2 deletions tests/feeds_tests.py
@@ -220,6 +220,7 @@ def test_feeds_helpers():
)
== 1
)

# no comments wanted
assert (
len(
@@ -254,9 +255,27 @@
assert determine_feed(
'<html><body><a href="https://example.org/rss"><body/></html>', params
) == ["https://example.org/rss"]
assert determine_feed(
'<html><body><a href="https://example.org/feeds/posts/default/"><body/></html>',
params,
) == ["https://example.org/feeds/posts/default/"]
assert (
len(
determine_feed(
'<html><body><a href="https://www.test.org/cat/?feed=rss" /><body/></html>',
params,
)
)
== 1
)
assert determine_feed(
'<html><body><a href="?feed=rss" /><body/></html>',
params,
) == ["https://example.org/?feed=rss"]

# feed discovery
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.org/status/404")
assert not find_feed_urls("https://httpbun.com/status/404")
# Feedburner/Google links
assert handle_link_list(["https://feedproxy.google.com/ABCD"], params) == [
"https://feedproxy.google.com/ABCD"
@@ -271,7 +290,7 @@

def test_cli_behavior():
"""Test command-line interface with respect to feeds"""
testargs = ["", "--list", "--feed", "https://httpbun.org/xml"]
testargs = ["", "--list", "--feed", "https://httpbun.com/xml"]
with patch.object(sys, "argv", testargs):
assert main() is None

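
Note: the added determine_feed assertions cover link discovery, including resolving a relative href ("?feed=rss") against the page URL carried in params. The public entry point wrapping these helpers is find_feed_urls; a minimal sketch (the homepage is illustrative):

    # Sketch: feed discovery from a homepage; returns a list of feed URLs,
    # empty when nothing is found (as asserted above for a 404 endpoint).
    from trafilatura.feeds import find_feed_urls

    print(find_feed_urls("https://example.org/"))  # e.g. [] if no feeds are exposed
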
2 changes: 1 addition & 1 deletion tests/sitemaps_tests.py
@@ -155,7 +155,7 @@ def test_extraction():
def test_robotstxt():
'''Check if sitemaps can be found over robots.txt'''
assert not sitemaps.find_robots_sitemaps('https://http.org')
baseurl = 'https://httpbun.org'
baseurl = 'https://httpbun.com'
assert not sitemaps.find_robots_sitemaps(baseurl)
assert not sitemaps.extract_robots_sitemaps('# test', baseurl)
assert not sitemaps.extract_robots_sitemaps('# test'*10000, baseurl)
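
Note: a short sketch of the robots.txt-based discovery exercised here; find_robots_sitemaps fetches the host's robots.txt and collects its Sitemap: directives (httpbun.com exposes none, hence the negative assertion):

    # Sketch: sitemap discovery via robots.txt.
    from trafilatura import sitemaps

    print(sitemaps.find_robots_sitemaps("https://httpbun.com"))  # expected: []
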
46 changes: 23 additions & 23 deletions tests/spider_tests.py
@@ -27,40 +27,40 @@ def test_redirections():
"Test redirection detection."
_, _, baseurl = spider.probe_alternative_homepage('xyz')
assert baseurl is None
_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.org/redirect-to?url=https://example.org')
_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.com/redirect-to?url=https://example.org')
assert baseurl == 'https://example.org'
#_, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
"Test redirection detection using meta tag."
# empty
htmlstring, homepage = '"refresh"', 'https://httpbun.org/'
htmlstring, homepage = '"refresh"', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage
htmlstring, homepage = '<html></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# unusable
htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.org/'
htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# malformed
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# wrong URL
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is None and homepage2 is None

# normal
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.org/html"/></html>', 'http://test.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.com/html"/></html>', 'http://test.org/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is not None and homepage2 == 'https://httpbun.org/html'
assert htmlstring2 is not None and homepage2 == 'https://httpbun.com/html'


def test_process_links():
@@ -103,7 +103,7 @@ def test_process_links():

def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
url = 'https://httpbun.org/html'
url = 'https://httpbun.com/html'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# erroneous webpage
with pytest.raises(ValueError):
@@ -118,31 +118,31 @@ def test_crawl_logic():
base_url, i, known_num, rules, is_on = spider.init_crawl(url, None, None)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.org' and i == 1
assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.com' and i == 1
# delay between requests
assert spider.URL_STORE.get_crawl_delay('https://httpbun.org') == 5
assert spider.URL_STORE.get_crawl_delay('https://httpbun.org', default=2.0) == 2.0
assert spider.URL_STORE.get_crawl_delay('https://httpbun.com') == 5
assert spider.URL_STORE.get_crawl_delay('https://httpbun.com', default=2.0) == 2.0
# existing todo
spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = spider.init_crawl(url, [url,], None)
assert base_url == 'https://httpbun.org' and i == 0
assert base_url == 'https://httpbun.com' and i == 0


def test_crawl_page():
"Test page-by-page processing."
base_url = 'https://httpbun.org'
base_url = 'https://httpbun.com'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(['https://httpbun.org/links/2/2'])
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org')
spider.URL_STORE.add_urls(['https://httpbun.com/links/2/2'])
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert sorted(todo) == ['https://httpbun.org/links/2/0', 'https://httpbun.org/links/2/1']
assert sorted(todo) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1']
assert len(known_links) == 3 and visited_num == 1
# initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(['https://httpbun.org/html'])
spider.URL_STORE.add_urls(['https://httpbun.com/html'])
# if LANGID_FLAG is True:
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org', initial=True, lang='de')
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com', initial=True, lang='de')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
@@ -152,10 +152,10 @@ def test_crawl_page():
def test_focused_crawler():
"Test the whole focused crawler mechanism."
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler("https://httpbun.org/links/1/1", max_seen_urls=1)
## TODO: check this on GitHub Actions:
# assert sorted(known_links) == ['https://httpbun.org/links/1/0', 'https://httpbun.org/links/1/1']
# assert sorted(todo) == ['https://httpbun.org/links/1/0']
todo, known_links = spider.focused_crawler("https://httpbun.com/links/1/1", max_seen_urls=1)
## fails on GitHub Actions
## assert sorted(known_links) == ['https://httpbun.com/links/1/0', 'https://httpbun.com/links/1/1']
## assert sorted(todo) == ['https://httpbun.com/links/1/0']


if __name__ == '__main__':
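
Note: a minimal usage sketch of the focused crawler called in the last test; results depend on the live endpoint, which is why the assertions above stay commented out:

    # Sketch: one-step focused crawl; returns the remaining frontier (todo)
    # and all URLs known so far.
    from trafilatura import spider

    todo, known_links = spider.focused_crawler(
        "https://httpbun.com/links/1/1", max_seen_urls=1
    )
    print(sorted(todo), sorted(known_links))
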
5 changes: 4 additions & 1 deletion tests/unit_tests.py
@@ -389,7 +389,10 @@ def test_external():
teststring = f.read()
assert extract(teststring, no_fallback=True, include_tables=False) == ''
assert extract(teststring, no_fallback=False, include_tables=False) == ''

# invalid XML attributes: namespace colon in attribute key (issue #375); such attributes should be stripped
bad_xml = 'Testing<ul style="" padding:1px; margin:15px""><b>Features:</b> <li>Saves the cost of two dedicated phone lines.</li> al station using Internet or cellular technology.</li> <li>Requires no change to the existing Fire Alarm Control Panel configuration. The IPGSM-4G connects directly to the primary and secondary telephone ports.</li>'
res = extract(bad_xml, output_format='xml')
assert "Features" in res

def test_images():
'''Test image extraction function'''
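
Note: the new regression test pushes deliberately malformed markup (a stray colon in an attribute name, issue #375) through the XML output path. A minimal sketch of the call it exercises; the input is illustrative and extract may return None on very short documents:

    # Sketch: XML output should survive attributes with invalid names.
    from trafilatura import extract

    bad = ('<html><body>Testing<ul style="" padding:1px;"">'
           '<li>Some feature text long enough to extract.</li></ul></body></html>')
    print(extract(bad, output_format='xml'))
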
15 changes: 11 additions & 4 deletions trafilatura/downloads.py
@@ -28,11 +28,20 @@
from courlan import UrlStore
from courlan.network import redirection_test

from . import __version__
try: # Python 3.8+
from importlib.metadata import version
except ImportError:
from importlib_metadata import version


from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
uniquify_list)


LOGGER = logging.getLogger(__name__)
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50
MAX_REDIRECTS = 2

@@ -42,11 +51,9 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

LOGGER = logging.getLogger(__name__)

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])


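
Note: the User-Agent string now takes its version from package metadata rather than from trafilatura's own __init__; the motivation is inferred (avoiding an import of the package root from a submodule), not stated in the diff. The resulting pattern:

    # Sketch: version lookup via package metadata, stdlib on Python 3.8+,
    # backport below.
    try:
        from importlib.metadata import version
    except ImportError:  # Python < 3.8
        from importlib_metadata import version

    print("trafilatura/" + version("trafilatura")
          + " (+https://github.com/adbar/trafilatura)")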