From 6652b8ac1833dffb4f3187da948146650bafa88a Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 19 Dec 2024 12:29:34 +0200
Subject: [PATCH 1/6] fix: solve the response log issue so you now get a log
 for every response from the Fetchers

---
 scrapling/engines/toolbelt/custom.py | 6 +-----
 scrapling/parser.py                  | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py
index 62c8452..9705eb3 100644
--- a/scrapling/engines/toolbelt/custom.py
+++ b/scrapling/engines/toolbelt/custom.py
@@ -84,8 +84,6 @@ def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') ->
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict,
                  request_headers: Dict, encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c
         # For back-ward compatibility
         self.adaptor = self  # For easier debugging while working from a Python shell
 
-        if not Response._is_response_result_logged:
-            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-            Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
diff --git a/scrapling/parser.py b/scrapling/parser.py
index 5440a84..c82a270 100644
--- a/scrapling/parser.py
+++ b/scrapling/parser.py
@@ -155,7 +155,7 @@ def __get_correct_result(
 
         else:
             if issubclass(type(element), html.HtmlMixin):
-                return self.__class__(
+                return Adaptor(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,

From 844cb0c6a23da4eb1e80f71a520eaf502c382982 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 19 Dec 2024 13:00:19 +0200
Subject: [PATCH 2/6] fix(PlaywrightFetcher): Wait for the real response status
 code

This will return the real response status after redirection for websites that
have Cloudflare, instead of Playwright's default behavior.
---
 scrapling/engines/pw.py | 60 ++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py
index e4c80b7..e434032 100644
--- a/scrapling/engines/pw.py
+++ b/scrapling/engines/pw.py
@@ -193,12 +193,21 @@ def fetch(self, url: str) -> Response:
 
         :param url: Target url.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ + from playwright.sync_api import Response as PlaywrightResponse if not self.stealth or self.real_chrome: # Because rebrowser_playwright doesn't play well with real browsers from playwright.sync_api import sync_playwright else: from rebrowser_playwright.sync_api import sync_playwright + # Store the final response + final_response = None + + def handle_response(finished_response: PlaywrightResponse): + nonlocal final_response + if finished_response.request.resource_type == "document": + final_response = finished_response + with sync_playwright() as p: # Creating the browser if self.cdp_url: @@ -212,6 +221,8 @@ def fetch(self, url: str) -> Response: page = context.new_page() page.set_default_navigation_timeout(self.timeout) page.set_default_timeout(self.timeout) + # Listen for all responses + page.on("response", handle_response) if self.extra_headers: page.set_extra_http_headers(self.extra_headers) @@ -223,7 +234,7 @@ def fetch(self, url: str) -> Response: for script in self.__stealth_scripts(): page.add_init_script(path=script) - res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) + first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) page.wait_for_load_state(state="domcontentloaded") if self.network_idle: page.wait_for_load_state('networkidle') @@ -240,21 +251,24 @@ def fetch(self, url: str) -> Response: if self.network_idle: page.wait_for_load_state('networkidle') + response_bytes = final_response.body() if final_response else page.content().encode('utf-8') + # In case we didn't catch a document type somehow + final_response = final_response if final_response else first_response # This will be parsed inside `Response` - encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding + encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding # PlayWright API sometimes give empty status text for some reason! - status_text = res.status_text or StatusText.get(res.status) + status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=res.url, + url=final_response.url, text=page.content(), - body=page.content().encode('utf-8'), - status=res.status, + body=response_bytes, + status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()}, - headers=res.all_headers(), - request_headers=res.request.all_headers(), + headers=final_response.all_headers(), + request_headers=final_response.request.all_headers(), **self.adaptor_arguments ) page.close() @@ -266,12 +280,21 @@ async def async_fetch(self, url: str) -> Response: :param url: Target url. 
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:  # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +308,8 @@ async def async_fetch(self, url: str) -> Response:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +321,7 @@ async def async_fetch(self, url: str) -> Response:
                 for script in self.__stealth_scripts():
                     await page.add_init_script(path=script)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +338,24 @@ async def async_fetch(self, url: str) -> Response:
                 if self.network_idle:
                     await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()

From 94745b155533502d55af8a18fe5097bf88ab575b Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 19 Dec 2024 13:08:41 +0200
Subject: [PATCH 3/6] fix(StealthyFetcher): Wait for the real response status
 code

This will return the real response status after redirection for websites that
have Cloudflare, instead of the Playwright API's default behavior.
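
The fix follows the same pattern used for PlaywrightFetcher in the previous
patch: subscribe to the page's "response" events, remember the last response
whose request has resource type "document", and fall back to the response
returned by goto() if no such event was captured. A minimal, standalone sketch
of the idea in plain sync Playwright (the target URL is only a placeholder):

    from playwright.sync_api import sync_playwright

    final_response = None

    def handle_response(response):
        global final_response
        # A redirect chain or a Cloudflare challenge produces several
        # document-type responses; keep only the most recent one.
        if response.request.resource_type == "document":
            final_response = response

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.on("response", handle_response)
        first_response = page.goto("https://example.com")  # placeholder URL
        page.wait_for_load_state(state="domcontentloaded")
        # Fall back to goto()'s response if no document response was caught
        final_response = final_response or first_response
        print(final_response.status, final_response.url)
        browser.close()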
--- scrapling/engines/camo.py | 58 ++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index 1eb9976..2c0ff3d 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -84,6 +84,14 @@ def fetch(self, url: str) -> Response: :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ addons = [] if self.disable_ads else [DefaultAddons.UBO] + # Store the final response + final_response = None + + def handle_response(finished_response): + nonlocal final_response + if finished_response.request.resource_type == "document": + final_response = finished_response + with Camoufox( geoip=self.geoip, proxy=self.proxy, @@ -100,13 +108,15 @@ def fetch(self, url: str) -> Response: page = browser.new_page() page.set_default_navigation_timeout(self.timeout) page.set_default_timeout(self.timeout) + # Listen for all responses + page.on("response", handle_response) if self.disable_resources: page.route("**/*", intercept_route) if self.extra_headers: page.set_extra_http_headers(self.extra_headers) - res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) + first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) page.wait_for_load_state(state="domcontentloaded") if self.network_idle: page.wait_for_load_state('networkidle') @@ -123,21 +133,24 @@ def fetch(self, url: str) -> Response: if self.network_idle: page.wait_for_load_state('networkidle') + response_bytes = final_response.body() if final_response else page.content().encode('utf-8') + # In case we didn't catch a document type somehow + final_response = final_response if final_response else first_response # This will be parsed inside `Response` - encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding + encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding # PlayWright API sometimes give empty status text for some reason! 
- status_text = res.status_text or StatusText.get(res.status) + status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=res.url, + url=final_response.url, text=page.content(), - body=page.content().encode('utf-8'), - status=res.status, + body=response_bytes, + status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()}, - headers=res.all_headers(), - request_headers=res.request.all_headers(), + headers=final_response.all_headers(), + request_headers=final_response.request.all_headers(), **self.adaptor_arguments ) page.close() @@ -151,6 +164,14 @@ async def async_fetch(self, url: str) -> Response: :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ addons = [] if self.disable_ads else [DefaultAddons.UBO] + # Store the final response + final_response = None + + async def handle_response(finished_response): + nonlocal final_response + if finished_response.request.resource_type == "document": + final_response = finished_response + async with AsyncCamoufox( geoip=self.geoip, proxy=self.proxy, @@ -167,13 +188,15 @@ async def async_fetch(self, url: str) -> Response: page = await browser.new_page() page.set_default_navigation_timeout(self.timeout) page.set_default_timeout(self.timeout) + # Listen for all responses + page.on("response", handle_response) if self.disable_resources: await page.route("**/*", async_intercept_route) if self.extra_headers: await page.set_extra_http_headers(self.extra_headers) - res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) + first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None) await page.wait_for_load_state(state="domcontentloaded") if self.network_idle: await page.wait_for_load_state('networkidle') @@ -190,21 +213,24 @@ async def async_fetch(self, url: str) -> Response: if self.network_idle: await page.wait_for_load_state('networkidle') + response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8') + # In case we didn't catch a document type somehow + final_response = final_response if final_response else first_response # This will be parsed inside `Response` - encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding + encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding # PlayWright API sometimes give empty status text for some reason! 
- status_text = res.status_text or StatusText.get(res.status) + status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=res.url, + url=final_response.url, text=await page.content(), - body=(await page.content()).encode('utf-8'), - status=res.status, + body=response_bytes, + status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()}, - headers=await res.all_headers(), - request_headers=await res.request.all_headers(), + headers=await final_response.all_headers(), + request_headers=await final_response.request.all_headers(), **self.adaptor_arguments ) await page.close() From 5be8c2a95edfd2ef1f2826d343fce4b369d564b5 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Thu, 19 Dec 2024 13:22:12 +0200 Subject: [PATCH 4/6] style(Fetchers): Setting correct type hint for `wait_selector_state` argument --- scrapling/core/_types.py | 2 ++ scrapling/engines/camo.py | 4 ++-- scrapling/engines/pw.py | 5 +++-- scrapling/fetchers.py | 14 +++++++------- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/scrapling/core/_types.py b/scrapling/core/_types.py index 8816b90..84e9a51 100644 --- a/scrapling/core/_types.py +++ b/scrapling/core/_types.py @@ -5,6 +5,8 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Pattern, Tuple, Type, Union) +SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"] + try: from typing import Protocol except ImportError: diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index 2c0ff3d..f1d8f95 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -3,7 +3,7 @@ from camoufox.sync_api import Camoufox from scrapling.core._types import (Callable, Dict, List, Literal, Optional, - Union) + SelectorWaitStates, Union) from scrapling.core.utils import log from scrapling.engines.toolbelt import (Response, StatusText, async_intercept_route, @@ -18,7 +18,7 @@ def __init__( self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False, block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True, timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None, - wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, + wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False, adaptor_arguments: Dict = None, diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index e434032..0a6d30d 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -1,6 +1,7 @@ import json -from scrapling.core._types import Callable, Dict, Optional, Union +from scrapling.core._types import (Callable, Dict, Optional, + SelectorWaitStates, Union) from scrapling.core.utils import log, lru_cache from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY) @@ -23,7 +24,7 @@ def __init__( page_action: Callable = None, wait_selector: Optional[str] = None, 
locale: Optional[str] = 'en-US', - wait_selector_state: Optional[str] = 'attached', + wait_selector_state: SelectorWaitStates = 'attached', stealth: Optional[bool] = False, real_chrome: Optional[bool] = False, hide_canvas: Optional[bool] = False, diff --git a/scrapling/fetchers.py b/scrapling/fetchers.py index 86f77ae..bdf87a6 100644 --- a/scrapling/fetchers.py +++ b/scrapling/fetchers.py @@ -1,5 +1,5 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional, - Union) + SelectorWaitStates, Union) from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable) from scrapling.engines.toolbelt import BaseFetcher, Response @@ -176,8 +176,8 @@ def fetch( self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False, block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None, timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True, - wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None, - os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False, + wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, + proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False, ) -> Response: """ Opens up a browser and do your request based on your chosen options below. @@ -234,8 +234,8 @@ async def async_fetch( self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False, block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None, timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True, - wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None, - os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False, + wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, + proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False, ) -> Response: """ Opens up a browser and do your request based on your chosen options below. 
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None, useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ def fetch(
     async def async_fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None, useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,

From f74db648bc5c3861f3efb2314628cb641c3c69d1 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 19 Dec 2024 13:29:45 +0200
Subject: [PATCH 5/6] build: Bumping the version to 0.2.91

---
 scrapling/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/scrapling/__init__.py b/scrapling/__init__.py
index b8c7396..eeba624 100644
--- a/scrapling/__init__.py
+++ b/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
diff --git a/setup.cfg b/setup.cfg
index b8b9227..700f9d6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.9
+version = 0.2.91
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
diff --git a/setup.py b/setup.py
index baa9429..1a765fb 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name="scrapling",
-    version="0.2.9",
+    version="0.2.91",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python.
 It simplifies the process of extracting data from websites, even when they undergo structural changes,
 and offers impressive speed improvements over many popular scraping tools.""",
@@ -37,7 +37,6 @@
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",

From bb398693b87d5bfbeb6f4b9270dd1e3cbbd64461 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 19 Dec 2024 13:30:43 +0200
Subject: [PATCH 6/6] feat(Fetcher): Adding support for proxies using the SOCKS
 protocol

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1a765fb..45b88ad 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@
         "w3lib",
         "orjson>=3",
         "tldextract",
-        'httpx[brotli,zstd]',
+        'httpx[brotli,zstd, socks]',
         'playwright>=1.49.1',
         'rebrowser-playwright>=1.49.1',
         'camoufox[geoip]>=0.4.9'
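
Note: the `socks` extra installs the socksio package, which is what enables
SOCKS proxy URLs in httpx, the HTTP client behind Fetcher. A minimal sketch of
what this unlocks, using httpx directly (the proxy address is a placeholder,
and the `proxy` keyword argument requires httpx >= 0.26):

    import httpx

    # The 'socks5://' scheme only works once the httpx[socks] extra
    # (i.e. the socksio package) is installed.
    with httpx.Client(proxy="socks5://127.0.0.1:9050") as client:  # placeholder proxy
        response = client.get("https://example.com")
        print(response.status_code)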