Commit

Merge branch 'master' into master

adbar authored Jan 4, 2024
2 parents b1e5f91 + de57ac1 commit f8d208b
Showing 12 changed files with 125 additions and 57 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/tests.yml
@@ -17,6 +17,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"]
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
include:
@@ -27,12 +28,16 @@ jobs:
python-version: 3.7
- os: macos-latest
python-version: 3.8
- os: macos-latest
python-version: "3.10"
- os: macos-latest
python-version: "3.12"
- os: windows-latest
python-version: 3.8
- os: ubuntu-latest
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.12-dev"
python-version: "3.12"
steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
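
Note: the new comment points at the manifest that actions/setup-python draws its interpreters from. A minimal sketch for inspecting it, assuming the manifest is a JSON array of objects with "version" and "stable" keys (layout as of early 2024, not guaranteed):

    # Sketch: list the stable Python versions available to actions/setup-python.
    import json
    import urllib.request

    MANIFEST = ("https://raw.githubusercontent.com/actions/"
                "python-versions/main/versions-manifest.json")
    with urllib.request.urlopen(MANIFEST) as resp:
        releases = json.load(resp)
    print(sorted({r["version"] for r in releases if r.get("stable")}))
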
8 changes: 5 additions & 3 deletions setup.py
@@ -112,10 +112,12 @@ def get_long_description():
"charset_normalizer >= 3.0.1; python_version < '3.7'",
"charset_normalizer >= 3.2.0; python_version >= '3.7'",
"courlan >= 0.9.5",
"htmldate >= 1.6.0",
"htmldate >= 1.6.1",
"importlib_metadata; python_version < '3.8'",
"justext >= 3.0.0",
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
"lxml == 4.9.2 ; platform_system == 'Darwin'",
# see tests on GitHub Actions
"lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
"psutil",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
"urllib3 >= 1.26, < 3; python_version >= '3.7'",
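
Note: the new lxml pins use PEP 508 environment markers, which pip evaluates against the target interpreter and platform at install time. A minimal sketch of that evaluation, assuming the third-party packaging library is available; the environment dicts are illustrative:

    # Sketch: evaluating a PEP 508 marker like the ones in the new lxml pins.
    from packaging.markers import Marker

    marker = Marker("platform_system == 'Darwin' and python_version <= '3.8'")
    print(marker.evaluate({"platform_system": "Darwin", "python_version": "3.8"}))   # True
    print(marker.evaluate({"platform_system": "Linux", "python_version": "3.11"}))   # False
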
16 changes: 8 additions & 8 deletions tests/cli_tests.py
@@ -218,7 +218,7 @@ def test_download():
#teststring = fetch_url(url)
#assert teststring is not None
#assert cli.examine(teststring, args, url) is None
url = 'https://httpbun.org/html'
url = 'https://httpbun.com/html'
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
@@ -408,27 +408,27 @@ def test_crawling():
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
testargs = ['', '--crawl', 'https://httpbun.com/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'
assert f.getvalue() == 'https://httpbun.com/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on GitHub Actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
args.crawl = 'https://httpbun.com/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
@@ -437,13 +437,13 @@ def test_crawling():
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'
assert f.getvalue().strip() == 'https://httpbun.com/html'


def test_probing():
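
Note: these edits only move the tests from httpbun.org to httpbun.com; the pattern they exercise is unchanged. A minimal sketch of that pattern with the public fetch_url helper (the endpoint is a live network dependency, which is exactly why the host had to change):

    # Sketch: fetch_url() returns the decoded page on success and None on
    # failure, so a dead test endpoint fails every downstream assertion.
    from trafilatura import fetch_url

    html = fetch_url("https://httpbun.com/html")
    if html is not None:
        print(len(html))
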
18 changes: 9 additions & 9 deletions tests/downloads_tests.py
@@ -53,27 +53,27 @@ def test_fetch():
assert _send_request('', True, DEFAULT_CONFIG) is None

# is_live general tests
assert _urllib3_is_live_page('https://httpbun.org/status/301') is True
assert _urllib3_is_live_page('https://httpbun.org/status/404') is False
assert is_live_page('https://httpbun.org/status/403') is False
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
assert is_live_page('https://httpbun.com/status/403') is False
# is_live pycurl tests
if pycurl is not None:
assert _pycurl_is_live_page('https://httpbun.org/status/301') is True
assert _pycurl_is_live_page('https://httpbun.com/status/301') is True

# fetch_url
assert fetch_url('#@1234') is None
assert fetch_url('https://httpbun.org/status/404') is None
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.org/status/200'
response = _send_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
url = 'https://httpbun.com/status/200'
response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
@@ -155,7 +155,7 @@ def test_queue():
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
inputurls = ['https://httpbun.org/status/301', 'https://httpbun.org/status/304', 'https://httpbun.org/status/200', 'https://httpbun.org/status/300', 'https://httpbun.org/status/400', 'https://httpbun.org/status/505']
inputurls = ['https://httpbun.com/status/301', 'https://httpbun.com/status/304', 'https://httpbun.com/status/200', 'https://httpbun.com/status/300', 'https://httpbun.com/status/400', 'https://httpbun.com/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
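
Note: the liveness probes pinned down above can be reproduced with the public helper alone; a short sketch (expected values mirror the assertions, but the endpoints are live dependencies):

    # Sketch: HEAD-based liveness check -- a redirect counts as live,
    # a 404 does not.
    from trafilatura.downloads import is_live_page

    print(is_live_page("https://httpbun.com/status/301"))  # expected: True
    print(is_live_page("https://httpbun.com/status/404"))  # expected: False
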
23 changes: 21 additions & 2 deletions tests/feeds_tests.py
@@ -220,6 +220,7 @@ def test_feeds_helpers():
)
== 1
)

# no comments wanted
assert (
len(
@@ -254,9 +255,27 @@
assert determine_feed(
'<html><body><a href="https://example.org/rss"><body/></html>', params
) == ["https://example.org/rss"]
assert determine_feed(
'<html><body><a href="https://example.org/feeds/posts/default/"><body/></html>',
params,
) == ["https://example.org/feeds/posts/default/"]
assert (
len(
determine_feed(
'<html><body><a href="https://www.test.org/cat/?feed=rss" /><body/></html>',
params,
)
)
== 1
)
assert determine_feed(
'<html><body><a href="?feed=rss" /><body/></html>',
params,
) == ["https://example.org/?feed=rss"]

# feed discovery
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.org/status/404")
assert not find_feed_urls("https://httpbun.com/status/404")
# Feedburner/Google links
assert handle_link_list(["https://feedproxy.google.com/ABCD"], params) == [
"https://feedproxy.google.com/ABCD"
@@ -271,7 +290,7 @@

def test_cli_behavior():
"""Test command-line interface with respect to feeds"""
testargs = ["", "--list", "--feed", "https://httpbun.org/xml"]
testargs = ["", "--list", "--feed", "https://httpbun.com/xml"]
with patch.object(sys, "argv", testargs):
assert main() is None

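
Note: the added determine_feed assertions cover link discovery, including resolving a relative href ("?feed=rss") against the page URL carried in params. The public entry point wrapping these helpers is find_feed_urls; a minimal sketch (the homepage is illustrative):

    # Sketch: feed discovery from a homepage; returns a list of feed URLs,
    # empty when nothing is found (as asserted above for a 404 endpoint).
    from trafilatura.feeds import find_feed_urls

    print(find_feed_urls("https://example.org/"))  # e.g. [] if no feeds are exposed
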
2 changes: 1 addition & 1 deletion tests/sitemaps_tests.py
@@ -155,7 +155,7 @@ def test_extraction():
def test_robotstxt():
'''Check if sitemaps can be found over robots.txt'''
assert not sitemaps.find_robots_sitemaps('https://http.org')
baseurl = 'https://httpbun.org'
baseurl = 'https://httpbun.com'
assert not sitemaps.find_robots_sitemaps(baseurl)
assert not sitemaps.extract_robots_sitemaps('# test', baseurl)
assert not sitemaps.extract_robots_sitemaps('# test'*10000, baseurl)
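
Note: a short sketch of the robots.txt-based discovery exercised here; find_robots_sitemaps fetches the host's robots.txt and collects its Sitemap: directives (httpbun.com exposes none, hence the negative assertion):

    # Sketch: sitemap discovery via robots.txt.
    from trafilatura import sitemaps

    print(sitemaps.find_robots_sitemaps("https://httpbun.com"))  # expected: []
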
46 changes: 23 additions & 23 deletions tests/spider_tests.py
@@ -27,40 +27,40 @@ def test_redirections():
"Test redirection detection."
_, _, baseurl = spider.probe_alternative_homepage('xyz')
assert baseurl is None
_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.org/redirect-to?url=https://example.org')
_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.com/redirect-to?url=https://example.org')
assert baseurl == 'https://example.org'
#_, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
"Test redirection detection using meta tag."
# empty
htmlstring, homepage = '"refresh"', 'https://httpbun.org/'
htmlstring, homepage = '"refresh"', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage
htmlstring, homepage = '<html></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# unusable
htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.org/'
htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# malformed
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# wrong URL
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is None and homepage2 is None

# normal
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.org/html"/></html>', 'http://test.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.com/html"/></html>', 'http://test.org/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is not None and homepage2 == 'https://httpbun.org/html'
assert htmlstring2 is not None and homepage2 == 'https://httpbun.com/html'


def test_process_links():
@@ -103,7 +103,7 @@ def test_process_links():

def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
url = 'https://httpbun.org/html'
url = 'https://httpbun.com/html'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# erroneous webpage
with pytest.raises(ValueError):
@@ -118,31 +118,31 @@ def test_crawl_logic():
base_url, i, known_num, rules, is_on = spider.init_crawl(url, None, None)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.org' and i == 1
assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.com' and i == 1
# delay between requests
assert spider.URL_STORE.get_crawl_delay('https://httpbun.org') == 5
assert spider.URL_STORE.get_crawl_delay('https://httpbun.org', default=2.0) == 2.0
assert spider.URL_STORE.get_crawl_delay('https://httpbun.com') == 5
assert spider.URL_STORE.get_crawl_delay('https://httpbun.com', default=2.0) == 2.0
# existing todo
spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = spider.init_crawl(url, [url,], None)
assert base_url == 'https://httpbun.org' and i == 0
assert base_url == 'https://httpbun.com' and i == 0


def test_crawl_page():
"Test page-by-page processing."
base_url = 'https://httpbun.org'
base_url = 'https://httpbun.com'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(['https://httpbun.org/links/2/2'])
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org')
spider.URL_STORE.add_urls(['https://httpbun.com/links/2/2'])
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert sorted(todo) == ['https://httpbun.org/links/2/0', 'https://httpbun.org/links/2/1']
assert sorted(todo) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1']
assert len(known_links) == 3 and visited_num == 1
# initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(['https://httpbun.org/html'])
spider.URL_STORE.add_urls(['https://httpbun.com/html'])
# if LANGID_FLAG is True:
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org', initial=True, lang='de')
is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com', initial=True, lang='de')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
@@ -152,10 +152,10 @@ def test_crawl_page():
def test_focused_crawler():
"Test the whole focused crawler mechanism."
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler("https://httpbun.org/links/1/1", max_seen_urls=1)
## TODO: check this on GitHub Actions:
# assert sorted(known_links) == ['https://httpbun.org/links/1/0', 'https://httpbun.org/links/1/1']
# assert sorted(todo) == ['https://httpbun.org/links/1/0']
todo, known_links = spider.focused_crawler("https://httpbun.com/links/1/1", max_seen_urls=1)
## fails on GitHub Actions
## assert sorted(known_links) == ['https://httpbun.com/links/1/0', 'https://httpbun.com/links/1/1']
## assert sorted(todo) == ['https://httpbun.com/links/1/0']


if __name__ == '__main__':
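
Note: a minimal usage sketch of the focused crawler called in the last test; results depend on the live endpoint, which is why the assertions above stay commented out:

    # Sketch: one-step focused crawl; returns the remaining frontier (todo)
    # and all URLs known so far.
    from trafilatura import spider

    todo, known_links = spider.focused_crawler(
        "https://httpbun.com/links/1/1", max_seen_urls=1
    )
    print(sorted(todo), sorted(known_links))
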
5 changes: 4 additions & 1 deletion tests/unit_tests.py
@@ -389,7 +389,10 @@ def test_external():
teststring = f.read()
assert extract(teststring, no_fallback=True, include_tables=False) == ''
assert extract(teststring, no_fallback=False, include_tables=False) == ''

# invalid XML attributes: namespace colon in attribute key (issue #375); such attributes should be stripped
bad_xml = 'Testing<ul style="" padding:1px; margin:15px""><b>Features:</b> <li>Saves the cost of two dedicated phone lines.</li> al station using Internet or cellular technology.</li> <li>Requires no change to the existing Fire Alarm Control Panel configuration. The IPGSM-4G connects directly to the primary and secondary telephone ports.</li>'
res = extract(bad_xml, output_format='xml')
assert "Features" in res

def test_images():
'''Test image extraction function'''
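
Note: the new regression test pushes deliberately malformed markup (a stray colon in an attribute name, issue #375) through the XML output path. A minimal sketch of the call it exercises; the input is illustrative and extract may return None on very short documents:

    # Sketch: XML output should survive attributes with invalid names.
    from trafilatura import extract

    bad = ('<html><body>Testing<ul style="" padding:1px;"">'
           '<li>Some feature text long enough to extract.</li></ul></body></html>')
    print(extract(bad, output_format='xml'))
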
15 changes: 11 additions & 4 deletions trafilatura/downloads.py
@@ -28,11 +28,20 @@
from courlan import UrlStore
from courlan.network import redirection_test

from . import __version__
try: # Python 3.8+
from importlib.metadata import version
except ImportError:
from importlib_metadata import version


from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
uniquify_list)


LOGGER = logging.getLogger(__name__)
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50
MAX_REDIRECTS = 2

@@ -42,11 +51,9 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

LOGGER = logging.getLogger(__name__)

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])


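
Note: the User-Agent string now takes its version from package metadata rather than from trafilatura's own __init__; the motivation is inferred (avoiding an import of the package root from a submodule), not stated in the diff. The resulting pattern:

    # Sketch: version lookup via package metadata, stdlib on Python 3.8+,
    # backport below.
    try:
        from importlib.metadata import version
    except ImportError:  # Python < 3.8
        from importlib_metadata import version

    print("trafilatura/" + version("trafilatura")
          + " (+https://github.com/adbar/trafilatura)")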