Merge pull request #8 from D4Vinci/dev
v0.2.1
D4Vinci authored Nov 15, 2024
2 parents: 9e33725 + 90e38af, merged as commit 773fcd5
Showing 14 changed files with 154 additions and 102 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -1,5 +1,5 @@
name: Tests
on: [push, pull_request]
on: [push]

concurrency:
group: ${{github.workflow}}-${{ github.ref }}
37 changes: 24 additions & 13 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2"
__version__ = "0.2.1"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


14 changes: 13 additions & 1 deletion scrapling/core/utils.py
@@ -4,8 +4,9 @@
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache

from scrapling.core._types import Dict, Iterable, Any
from scrapling.core._types import Dict, Iterable, Any, Union

import orjson
from lxml import html

html_forbidden = {html.HtmlComment, }
@@ -18,6 +19,17 @@
)


def is_jsonable(content: Union[bytes, str]) -> bool:
if type(content) is bytes:
content = content.decode()

try:
_ = orjson.loads(content)
return True
except orjson.JSONDecodeError:
return False


@cache(None, typed=True)
def setup_basic_logging(level: str = 'debug'):
levels = {
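The new `is_jsonable` helper in scrapling/core/utils.py reports whether a bytes or str payload parses as JSON via orjson. A quick usage sketch (the calls below are illustrative, not part of this commit):

from scrapling.core.utils import is_jsonable

# Bytes are decoded (UTF-8 by default) before orjson tries to parse the text.
print(is_jsonable(b'{"status": "ok"}'))  # True
print(is_jsonable('not json at all'))    # False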
23 changes: 15 additions & 8 deletions scrapling/engines/camo.py
@@ -7,6 +7,7 @@
get_os_name,
intercept_route,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

@@ -18,7 +19,8 @@ def __init__(
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
):
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -33,12 +35,14 @@ def __init__(
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
:param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
:param wait_selector: Wait for a specific css selector to be in a specific state.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
"""
self.headless = headless
@@ -48,7 +52,9 @@ def __init__(
self.allow_webgl = bool(allow_webgl)
self.network_idle = bool(network_idle)
self.google_search = bool(google_search)
self.os_randomize = bool(os_randomize)
self.extra_headers = extra_headers or {}
self.proxy = construct_proxy_dict(proxy)
self.addons = addons or []
self.humanize = humanize
self.timeout = check_type_validity(timeout, [int, float], 30000)
Expand All @@ -66,17 +72,18 @@ def fetch(self, url: str) -> Response:
"""Opens up the browser and do your request based on your chosen options.
:param url: Target url.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
with Camoufox(
headless=self.headless,
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
os=get_os_name(),
block_webrtc=self.block_webrtc,
allow_webgl=self.allow_webgl,
proxy=self.proxy,
addons=self.addons,
headless=self.headless,
humanize=self.humanize,
i_know_what_im_doing=True, # To turn warnings off with user configurations
i_know_what_im_doing=True, # To turn warnings off with the user configurations
allow_webgl=self.allow_webgl,
block_webrtc=self.block_webrtc,
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
os=None if self.os_randomize else get_os_name(),
) as browser:
page = browser.new_page()
page.set_default_navigation_timeout(self.timeout)
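The Camoufox engine gains `proxy` and `os_randomize`, which are surfaced through `StealthyFetcher`. A hedged sketch of passing them (the proxy address is a placeholder, and the keyword names are assumed to be forwarded unchanged from the engine arguments above):

from scrapling import StealthyFetcher

# proxy accepts a URL string or a dict limited to the keys 'server', 'username', 'password';
# os_randomize=True picks a random OS fingerprint instead of matching the host OS.
page = StealthyFetcher().fetch(
    'https://example.com',
    proxy='http://user:[email protected]:8080',
    os_randomize=True,
)
print(page.status, page.reason)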
9 changes: 7 additions & 2 deletions scrapling/engines/pw.py
@@ -9,8 +9,9 @@
js_bypass_path,
intercept_route,
generate_headers,
check_type_validity,
construct_cdp_url,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

@@ -33,6 +34,7 @@ def __init__(
nstbrowser_config: Optional[Dict] = None,
google_search: Optional[bool] = True,
extra_headers: Optional[Dict[str, str]] = None,
proxy: Optional[Union[str, Dict[str, str]]] = None,
adaptor_arguments: Dict = None
):
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
@@ -54,6 +56,7 @@ def __init__(
:param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
"""
@@ -65,6 +68,7 @@ def __init__(
self.disable_webgl = bool(disable_webgl)
self.google_search = bool(google_search)
self.extra_headers = extra_headers or {}
self.proxy = construct_proxy_dict(proxy)
self.cdp_url = cdp_url
self.useragent = useragent
self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -112,7 +116,7 @@ def fetch(self, url: str) -> Response:
"""Opens up the browser and do your request based on your chosen options.
:param url: Target url.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
if not self.stealth:
from playwright.sync_api import sync_playwright
@@ -151,6 +155,7 @@ def fetch(self, url: str) -> Response:
locale='en-US',
is_mobile=False,
has_touch=False,
proxy=self.proxy,
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
user_agent=useragent,
device_scale_factor=2,
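The Playwright engine gets the same `proxy` argument; a string is normalized by `construct_proxy_dict` into the dict form Playwright's context expects, and a dict is validated and used as-is. A sketch, assuming `PlayWrightFetcher.fetch` forwards these keywords (the proxy values are placeholders):

from scrapling import PlayWrightFetcher

# Equivalent to passing the single string 'http://user:[email protected]:8080'.
page = PlayWrightFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://proxy.example.com:8080', 'username': 'user', 'password': 'pass'},
)
print(page.status)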
18 changes: 9 additions & 9 deletions scrapling/engines/static.py
@@ -48,7 +48,7 @@ def _prepare_response(self, response: httpxResponse) -> Response:
"""Takes httpx response and generates `Response` object from it.
:param response: httpx response object
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
return Response(
url=str(response.url),
@@ -69,9 +69,9 @@ def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -81,9 +81,9 @@ def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -93,9 +93,9 @@ def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Di
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -105,8 +105,8 @@ def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)
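Switching all four methods from `kwargs.get('headers')` to `kwargs.pop('headers', {})` fixes a collision: with `get`, a caller-supplied `headers` keyword stayed inside `**kwargs` and reached httpx a second time, raising a "got multiple values for keyword argument 'headers'" error. A minimal sketch of the call that now works, assuming the public `Fetcher` wrapper forwards to these methods (URL and header are illustrative):

from scrapling import Fetcher

# The custom header is merged once by _headers_job and forwarded to httpx exactly once.
page = Fetcher().get('https://httpbin.org/headers', headers={'X-Custom': 'value'})
print(page.status)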
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
@@ -15,4 +15,5 @@
js_bypass_path,
intercept_route,
construct_cdp_url,
construct_proxy_dict,
)
46 changes: 16 additions & 30 deletions scrapling/engines/toolbelt/custom.py
@@ -3,43 +3,29 @@
"""
import inspect
import logging
from dataclasses import dataclass, field

from scrapling.core.utils import setup_basic_logging
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


@dataclass(frozen=True)
class Response:
class Response(Adaptor):
"""This class is returned by all engines as a way to unify response type between different libraries."""
url: str
text: str
content: bytes
status: int
reason: str
encoding: str = 'utf-8' # default encoding
cookies: Dict = field(default_factory=dict)
headers: Dict = field(default_factory=dict)
request_headers: Dict = field(default_factory=dict)
adaptor_arguments: Dict = field(default_factory=dict)

@property
def adaptor(self) -> Union[Adaptor, None]:
"""Generate Adaptor instance from this response if possible, otherwise return None"""
automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
if self.text:
# For playwright that will be the response after all JS executed
return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
elif self.content:
# For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
# To get response Bytes after the load states
# Reference: https://playwright.dev/python/docs/api/class-page
return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
return None

def __repr__(self):
return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'

def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)

self.status = status
self.reason = reason
self.cookies = cookies
self.headers = headers
self.request_headers = request_headers
# For back-ward compatibility
self.adaptor = self

# def __repr__(self):
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'


class BaseFetcher:
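Because `Response` now subclasses `Adaptor` instead of being a frozen dataclass, selection methods can be called on the response object directly, and `response.adaptor` survives only as a backward-compatible alias (`self.adaptor = self`). A short sketch (URL and selector are illustrative):

from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')
# New style: the response object is itself an Adaptor.
first_quote = page.css_first('.quote .text::text')
# Old style still works and points at the very same object.
same_quote = page.adaptor.css_first('.quote .text::text')
print(first_quote == same_quote)  # True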
34 changes: 34 additions & 0 deletions scrapling/engines/toolbelt/navigation.py
@@ -25,6 +25,40 @@ def intercept_route(route: Route) -> Union[Route, None]:
return route.continue_()


def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
"""Validate a proxy and return it in the acceptable format for Playwright
Reference: https://playwright.dev/python/docs/network#http-proxy
:param proxy_string: A string or a dictionary representation of the proxy.
:return:
"""
if proxy_string:
if isinstance(proxy_string, str):
proxy = urlparse(proxy_string)
try:
return {
'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
'username': proxy.username or '',
'password': proxy.password or '',
}
except ValueError:
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
raise TypeError(f'The proxy argument\'s string is in invalid format!')

elif isinstance(proxy_string, dict):
valid_keys = ('server', 'username', 'password', )
if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
return proxy_string
else:
raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')

else:
raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')

# The default value for proxy in Playwright's source is `None`
return None


def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
"""Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
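For reference, a sketch of what the new `construct_proxy_dict` helper accepts and returns (addresses and credentials are placeholders):

from scrapling.engines.toolbelt import construct_proxy_dict

# A proxy URL string is parsed into the dict format Playwright expects.
print(construct_proxy_dict('http://user:[email protected]:8080'))
# {'server': 'http://proxy.example.com:8080', 'username': 'user', 'password': 'pass'}

# A dict restricted to the keys 'server', 'username', and 'password' is returned unchanged;
# any other type or key raises a TypeError.
print(construct_proxy_dict({'server': 'http://127.0.0.1:8080'}))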
(Diffs for the remaining 4 changed files were not loaded on this page.)
