Commit ee59914

Merge pull request #26 from D4Vinci/dev

v0.2.91

D4Vinci authored Dec 19, 2024
2 parents 60df72c + bb39869 commit ee59914
Showing 9 changed files with 106 additions and 54 deletions.
2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -5,7 +5,7 @@
from scrapling.parser import Adaptor, Adaptors

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.9"
__version__ = "0.2.91"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


2 changes: 2 additions & 0 deletions scrapling/core/_types.py
@@ -5,6 +5,8 @@
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
List, Literal, Optional, Pattern, Tuple, Type, Union)

+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

try:
    from typing import Protocol
except ImportError:
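The new `SelectorWaitStates` alias narrows the `wait_selector_state` parameters below from a bare `str` to the four states Playwright's `wait_for_selector` accepts, so type checkers reject anything else. A minimal sketch of how a `Literal` alias can also be enforced at runtime; the `check_state` helper is illustrative, not part of scrapling:

from typing import Literal, get_args

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def check_state(state: str) -> None:
    # get_args() recovers the allowed literal values at runtime
    if state not in get_args(SelectorWaitStates):
        raise ValueError(f"invalid wait state: {state!r}")

check_state("visible")  # passes silently
check_state("loaded")   # raises ValueError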
62 changes: 44 additions & 18 deletions scrapling/engines/camo.py
@@ -3,7 +3,7 @@
from camoufox.sync_api import Camoufox

from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
from scrapling.core.utils import log
from scrapling.engines.toolbelt import (Response, StatusText,
async_intercept_route,
@@ -18,7 +18,7 @@ def __init__(
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
geoip: Optional[bool] = False,
adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ def fetch(self, url: str) -> Response:
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
addons = [] if self.disable_ads else [DefaultAddons.UBO]
+# Store the final response
+final_response = None
+
+def handle_response(finished_response):
+    nonlocal final_response
+    if finished_response.request.resource_type == "document":
+        final_response = finished_response
+
with Camoufox(
geoip=self.geoip,
proxy=self.proxy,
@@ -100,13 +108,15 @@ def fetch(self, url: str) -> Response:
page = browser.new_page()
page.set_default_navigation_timeout(self.timeout)
page.set_default_timeout(self.timeout)
+# Listen for all responses
+page.on("response", handle_response)
if self.disable_resources:
page.route("**/*", intercept_route)

if self.extra_headers:
page.set_extra_http_headers(self.extra_headers)

-res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
page.wait_for_load_state(state="domcontentloaded")
if self.network_idle:
page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ def fetch(self, url: str) -> Response:
if self.network_idle:
page.wait_for_load_state('networkidle')

+response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+# In case we didn't catch a document type somehow
+final_response = final_response if final_response else first_response
# This will be parsed inside `Response`
-encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
# PlayWright API sometimes give empty status text for some reason!
-status_text = res.status_text or StatusText.get(res.status)
+status_text = final_response.status_text or StatusText.get(final_response.status)

response = Response(
-url=res.url,
+url=final_response.url,
text=page.content(),
-body=page.content().encode('utf-8'),
-status=res.status,
+body=response_bytes,
+status=final_response.status,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-headers=res.all_headers(),
-request_headers=res.request.all_headers(),
+headers=final_response.all_headers(),
+request_headers=final_response.request.all_headers(),
**self.adaptor_arguments
)
page.close()
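The listener registered with `page.on("response", ...)` keeps the last top-level `document` response, which can differ from what `page.goto()` returned after redirects or client-side navigations. A minimal standalone sketch of the same pattern in plain Playwright (the target URL is illustrative):

from playwright.sync_api import sync_playwright

final_response = None

def handle_response(response):
    global final_response
    # Track only top-level documents, not images/scripts/XHR
    if response.request.resource_type == "document":
        final_response = response

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")
    page.wait_for_load_state("domcontentloaded")
    # Fall back to serializing the DOM if no document response was seen
    body = final_response.body() if final_response else page.content().encode("utf-8")
    print((final_response or first_response).status, len(body))
    browser.close()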
@@ -151,6 +164,14 @@ async def async_fetch(self, url: str) -> Response:
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
addons = [] if self.disable_ads else [DefaultAddons.UBO]
+# Store the final response
+final_response = None
+
+async def handle_response(finished_response):
+    nonlocal final_response
+    if finished_response.request.resource_type == "document":
+        final_response = finished_response
+
async with AsyncCamoufox(
geoip=self.geoip,
proxy=self.proxy,
@@ -167,13 +188,15 @@ async def async_fetch(self, url: str) -> Response:
page = await browser.new_page()
page.set_default_navigation_timeout(self.timeout)
page.set_default_timeout(self.timeout)
+# Listen for all responses
+page.on("response", handle_response)
if self.disable_resources:
await page.route("**/*", async_intercept_route)

if self.extra_headers:
await page.set_extra_http_headers(self.extra_headers)

-res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
await page.wait_for_load_state(state="domcontentloaded")
if self.network_idle:
await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ async def async_fetch(self, url: str) -> Response:
if self.network_idle:
await page.wait_for_load_state('networkidle')

+response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+# In case we didn't catch a document type somehow
+final_response = final_response if final_response else first_response
# This will be parsed inside `Response`
-encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
# PlayWright API sometimes give empty status text for some reason!
-status_text = res.status_text or StatusText.get(res.status)
+status_text = final_response.status_text or StatusText.get(final_response.status)

response = Response(
-url=res.url,
+url=final_response.url,
text=await page.content(),
-body=(await page.content()).encode('utf-8'),
-status=res.status,
+body=response_bytes,
+status=final_response.status,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-headers=await res.all_headers(),
-request_headers=await res.request.all_headers(),
+headers=await final_response.all_headers(),
+request_headers=await final_response.request.all_headers(),
**self.adaptor_arguments
)
await page.close()
65 changes: 47 additions & 18 deletions scrapling/engines/pw.py
@@ -1,6 +1,7 @@
import json

-from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
from scrapling.core.utils import log, lru_cache
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ def __init__(
page_action: Callable = None,
wait_selector: Optional[str] = None,
locale: Optional[str] = 'en-US',
-wait_selector_state: Optional[str] = 'attached',
+wait_selector_state: SelectorWaitStates = 'attached',
stealth: Optional[bool] = False,
real_chrome: Optional[bool] = False,
hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ def fetch(self, url: str) -> Response:
:param url: Target url.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
+from playwright.sync_api import Response as PlaywrightResponse
if not self.stealth or self.real_chrome:
# Because rebrowser_playwright doesn't play well with real browsers
from playwright.sync_api import sync_playwright
else:
from rebrowser_playwright.sync_api import sync_playwright

+# Store the final response
+final_response = None
+
+def handle_response(finished_response: PlaywrightResponse):
+    nonlocal final_response
+    if finished_response.request.resource_type == "document":
+        final_response = finished_response
+
with sync_playwright() as p:
# Creating the browser
if self.cdp_url:
@@ -212,6 +222,8 @@ def fetch(self, url: str) -> Response:
page = context.new_page()
page.set_default_navigation_timeout(self.timeout)
page.set_default_timeout(self.timeout)
+# Listen for all responses
+page.on("response", handle_response)

if self.extra_headers:
page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ def fetch(self, url: str) -> Response:
for script in self.__stealth_scripts():
page.add_init_script(path=script)

-res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
page.wait_for_load_state(state="domcontentloaded")
if self.network_idle:
page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ def fetch(self, url: str) -> Response:
if self.network_idle:
page.wait_for_load_state('networkidle')

+response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+# In case we didn't catch a document type somehow
+final_response = final_response if final_response else first_response
# This will be parsed inside `Response`
-encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
# PlayWright API sometimes give empty status text for some reason!
-status_text = res.status_text or StatusText.get(res.status)
+status_text = final_response.status_text or StatusText.get(final_response.status)

response = Response(
-url=res.url,
+url=final_response.url,
text=page.content(),
-body=page.content().encode('utf-8'),
-status=res.status,
+body=response_bytes,
+status=final_response.status,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-headers=res.all_headers(),
-request_headers=res.request.all_headers(),
+headers=final_response.all_headers(),
+request_headers=final_response.request.all_headers(),
**self.adaptor_arguments
)
page.close()
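`StatusText.get` backfills the reason phrase when Playwright reports an empty `status_text`. A sketch of that fallback using the standard library's `http.HTTPStatus` as a stand-in for scrapling's internal lookup table, which this diff does not show:

from http import HTTPStatus

def reason_for(status: int, status_text: str = "") -> str:
    # Prefer what the browser reported; otherwise derive a phrase
    if status_text:
        return status_text
    try:
        return HTTPStatus(status).phrase
    except ValueError:
        return "Unknown Status Code"

assert reason_for(200) == "OK"
assert reason_for(404, "") == "Not Found"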
@@ -266,12 +281,21 @@ async def async_fetch(self, url: str) -> Response:
:param url: Target url.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
+from playwright.async_api import Response as PlaywrightResponse
if not self.stealth or self.real_chrome:
# Because rebrowser_playwright doesn't play well with real browsers
from playwright.async_api import async_playwright
else:
from rebrowser_playwright.async_api import async_playwright

+# Store the final response
+final_response = None
+
+async def handle_response(finished_response: PlaywrightResponse):
+    nonlocal final_response
+    if finished_response.request.resource_type == "document":
+        final_response = finished_response
+
async with async_playwright() as p:
# Creating the browser
if self.cdp_url:
@@ -285,6 +309,8 @@ async def async_fetch(self, url: str) -> Response:
page = await context.new_page()
page.set_default_navigation_timeout(self.timeout)
page.set_default_timeout(self.timeout)
+# Listen for all responses
+page.on("response", handle_response)

if self.extra_headers:
await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ async def async_fetch(self, url: str) -> Response:
for script in self.__stealth_scripts():
await page.add_init_script(path=script)

-res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
await page.wait_for_load_state(state="domcontentloaded")
if self.network_idle:
await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ async def async_fetch(self, url: str) -> Response:
if self.network_idle:
await page.wait_for_load_state('networkidle')

+response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+# In case we didn't catch a document type somehow
+final_response = final_response if final_response else first_response
# This will be parsed inside `Response`
-encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
# PlayWright API sometimes give empty status text for some reason!
-status_text = res.status_text or StatusText.get(res.status)
+status_text = final_response.status_text or StatusText.get(final_response.status)

response = Response(
-url=res.url,
+url=final_response.url,
text=await page.content(),
-body=(await page.content()).encode('utf-8'),
-status=res.status,
+body=response_bytes,
+status=final_response.status,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-headers=await res.all_headers(),
-request_headers=await res.request.all_headers(),
+headers=await final_response.all_headers(),
+request_headers=await final_response.request.all_headers(),
**self.adaptor_arguments
)
await page.close()
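The async path mirrors the sync one, with the handler written as a coroutine that closes over `final_response` via `nonlocal`. A standalone sketch under the same assumptions (plain Playwright, illustrative URL):

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    final_response = None

    async def handle_response(response):
        nonlocal final_response
        # Keep only the last top-level document response
        if response.request.resource_type == "document":
            final_response = response

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        page.on("response", handle_response)
        await page.goto("https://example.com")
        await page.wait_for_load_state("domcontentloaded")
        body = await final_response.body() if final_response else (await page.content()).encode("utf-8")
        print(len(body))
        await browser.close()

asyncio.run(main())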
6 changes: 1 addition & 5 deletions scrapling/engines/toolbelt/custom.py
@@ -84,8 +84,6 @@ def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') ->
class Response(Adaptor):
"""This class is returned by all engines as a way to unify response type between different libraries."""

-_is_response_result_logged = False # Class-level flag, initialized to False
-
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c
# For back-ward compatibility
self.adaptor = self
# For easier debugging while working from a Python shell
-if not Response._is_response_result_logged:
-    log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-    Response._is_response_result_logged = True
+log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')

# def __repr__(self):
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
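The engines above pass the raw `Content-Type` header as `encoding`, and the comment notes it "will be parsed inside `Response`". A rough sketch of what such parsing involves; the helper below is an assumption for illustration, not scrapling's actual implementation:

import re

def charset_from_content_type(content_type: str, default: str = "utf-8") -> str:
    # "text/html; charset=ISO-8859-1" -> "ISO-8859-1"
    match = re.search(r"charset=([\w-]+)", content_type, re.IGNORECASE)
    return match.group(1) if match else default

assert charset_from_content_type("text/html; charset=ISO-8859-1") == "ISO-8859-1"
assert charset_from_content_type("") == "utf-8"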