Merge pull request #8 from D4Vinci/dev
v0.2.1
D4Vinci authored Nov 15, 2024
2 parents: 9e33725 + 90e38af, merged as commit 773fcd5
Showing 14 changed files with 154 additions and 102 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -1,5 +1,5 @@
name: Tests
on: [push, pull_request]
on: [push]

concurrency:
group: ${{github.workflow}}-${{ github.ref }}
37 changes: 24 additions & 13 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2"
__version__ = "0.2.1"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


14 changes: 13 additions & 1 deletion scrapling/core/utils.py
@@ -4,8 +4,9 @@
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache

from scrapling.core._types import Dict, Iterable, Any
from scrapling.core._types import Dict, Iterable, Any, Union

import orjson
from lxml import html

html_forbidden = {html.HtmlComment, }
@@ -18,6 +19,17 @@
)


def is_jsonable(content: Union[bytes, str]) -> bool:
if type(content) is bytes:
content = content.decode()

try:
_ = orjson.loads(content)
return True
except orjson.JSONDecodeError:
return False


@cache(None, typed=True)
def setup_basic_logging(level: str = 'debug'):
levels = {
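The new `is_jsonable` helper in scrapling/core/utils.py reports whether a bytes or str payload parses as JSON via orjson. A quick usage sketch (the calls below are illustrative, not part of this commit):

from scrapling.core.utils import is_jsonable

# Bytes are decoded (UTF-8 by default) before orjson tries to parse the text.
print(is_jsonable(b'{"status": "ok"}'))  # True
print(is_jsonable('not json at all'))    # False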
23 changes: 15 additions & 8 deletions scrapling/engines/camo.py
@@ -7,6 +7,7 @@
get_os_name,
intercept_route,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

@@ -18,7 +19,8 @@ def __init__(
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
):
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -33,12 +35,14 @@ def __init__(
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
:param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
:param wait_selector: Wait for a specific css selector to be in a specific state.
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
"""
self.headless = headless
@@ -48,7 +52,9 @@ def __init__(
self.allow_webgl = bool(allow_webgl)
self.network_idle = bool(network_idle)
self.google_search = bool(google_search)
self.os_randomize = bool(os_randomize)
self.extra_headers = extra_headers or {}
self.proxy = construct_proxy_dict(proxy)
self.addons = addons or []
self.humanize = humanize
self.timeout = check_type_validity(timeout, [int, float], 30000)
Expand All @@ -66,17 +72,18 @@ def fetch(self, url: str) -> Response:
"""Opens up the browser and do your request based on your chosen options.
:param url: Target url.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
with Camoufox(
headless=self.headless,
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
os=get_os_name(),
block_webrtc=self.block_webrtc,
allow_webgl=self.allow_webgl,
proxy=self.proxy,
addons=self.addons,
headless=self.headless,
humanize=self.humanize,
i_know_what_im_doing=True, # To turn warnings off with user configurations
i_know_what_im_doing=True, # To turn warnings off with the user configurations
allow_webgl=self.allow_webgl,
block_webrtc=self.block_webrtc,
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
os=None if self.os_randomize else get_os_name(),
) as browser:
page = browser.new_page()
page.set_default_navigation_timeout(self.timeout)
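The Camoufox engine gains `proxy` and `os_randomize`, which are surfaced through `StealthyFetcher`. A hedged sketch of passing them (the proxy address is a placeholder, and the keyword names are assumed to be forwarded unchanged from the engine arguments above):

from scrapling import StealthyFetcher

# proxy accepts a URL string or a dict limited to the keys 'server', 'username', 'password';
# os_randomize=True picks a random OS fingerprint instead of matching the host OS.
page = StealthyFetcher().fetch(
    'https://example.com',
    proxy='http://user:[email protected]:8080',
    os_randomize=True,
)
print(page.status, page.reason)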
9 changes: 7 additions & 2 deletions scrapling/engines/pw.py
@@ -9,8 +9,9 @@
js_bypass_path,
intercept_route,
generate_headers,
check_type_validity,
construct_cdp_url,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

@@ -33,6 +34,7 @@ def __init__(
nstbrowser_config: Optional[Dict] = None,
google_search: Optional[bool] = True,
extra_headers: Optional[Dict[str, str]] = None,
proxy: Optional[Union[str, Dict[str, str]]] = None,
adaptor_arguments: Dict = None
):
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
@@ -54,6 +56,7 @@ def __init__(
:param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
"""
@@ -65,6 +68,7 @@ def __init__(
self.disable_webgl = bool(disable_webgl)
self.google_search = bool(google_search)
self.extra_headers = extra_headers or {}
self.proxy = construct_proxy_dict(proxy)
self.cdp_url = cdp_url
self.useragent = useragent
self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -112,7 +116,7 @@ def fetch(self, url: str) -> Response:
"""Opens up the browser and do your request based on your chosen options.
:param url: Target url.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
if not self.stealth:
from playwright.sync_api import sync_playwright
@@ -151,6 +155,7 @@ def fetch(self, url: str) -> Response:
locale='en-US',
is_mobile=False,
has_touch=False,
proxy=self.proxy,
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
user_agent=useragent,
device_scale_factor=2,
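The Playwright engine gets the same `proxy` argument; a string is normalized by `construct_proxy_dict` into the dict form Playwright's context expects, and a dict is validated and used as-is. A sketch, assuming `PlayWrightFetcher.fetch` forwards these keywords (the proxy values are placeholders):

from scrapling import PlayWrightFetcher

# Equivalent to passing the single string 'http://user:[email protected]:8080'.
page = PlayWrightFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://proxy.example.com:8080', 'username': 'user', 'password': 'pass'},
)
print(page.status)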
18 changes: 9 additions & 9 deletions scrapling/engines/static.py
@@ -48,7 +48,7 @@ def _prepare_response(self, response: httpxResponse) -> Response:
"""Takes httpx response and generates `Response` object from it.
:param response: httpx response object
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
return Response(
url=str(response.url),
@@ -69,9 +69,9 @@ def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -81,9 +81,9 @@ def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -93,9 +93,9 @@ def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Di
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)

@@ -105,8 +105,8 @@ def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
return self._prepare_response(request)
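Switching all four methods from `kwargs.get('headers')` to `kwargs.pop('headers', {})` fixes a collision: with `get`, a caller-supplied `headers` keyword stayed inside `**kwargs` and reached httpx a second time, raising a "got multiple values for keyword argument 'headers'" error. A minimal sketch of the call that now works, assuming the public `Fetcher` wrapper forwards to these methods (URL and header are illustrative):

from scrapling import Fetcher

# The custom header is merged once by _headers_job and forwarded to httpx exactly once.
page = Fetcher().get('https://httpbin.org/headers', headers={'X-Custom': 'value'})
print(page.status)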
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
@@ -15,4 +15,5 @@
js_bypass_path,
intercept_route,
construct_cdp_url,
construct_proxy_dict,
)
46 changes: 16 additions & 30 deletions scrapling/engines/toolbelt/custom.py
@@ -3,43 +3,29 @@
"""
import inspect
import logging
from dataclasses import dataclass, field

from scrapling.core.utils import setup_basic_logging
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


@dataclass(frozen=True)
class Response:
class Response(Adaptor):
"""This class is returned by all engines as a way to unify response type between different libraries."""
url: str
text: str
content: bytes
status: int
reason: str
encoding: str = 'utf-8' # default encoding
cookies: Dict = field(default_factory=dict)
headers: Dict = field(default_factory=dict)
request_headers: Dict = field(default_factory=dict)
adaptor_arguments: Dict = field(default_factory=dict)

@property
def adaptor(self) -> Union[Adaptor, None]:
"""Generate Adaptor instance from this response if possible, otherwise return None"""
automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
if self.text:
# For playwright that will be the response after all JS executed
return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
elif self.content:
# For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
# To get response Bytes after the load states
# Reference: https://playwright.dev/python/docs/api/class-page
return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
return None

def __repr__(self):
return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'

def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)

self.status = status
self.reason = reason
self.cookies = cookies
self.headers = headers
self.request_headers = request_headers
# For back-ward compatibility
self.adaptor = self

# def __repr__(self):
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'


class BaseFetcher:
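Because `Response` now subclasses `Adaptor` instead of being a frozen dataclass, selection methods can be called on the response object directly, and `response.adaptor` survives only as a backward-compatible alias (`self.adaptor = self`). A short sketch (URL and selector are illustrative):

from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')
# New style: the response object is itself an Adaptor.
first_quote = page.css_first('.quote .text::text')
# Old style still works and points at the very same object.
same_quote = page.adaptor.css_first('.quote .text::text')
print(first_quote == same_quote)  # True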
34 changes: 34 additions & 0 deletions scrapling/engines/toolbelt/navigation.py
@@ -25,6 +25,40 @@ def intercept_route(route: Route) -> Union[Route, None]:
return route.continue_()


def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
"""Validate a proxy and return it in the acceptable format for Playwright
Reference: https://playwright.dev/python/docs/network#http-proxy
:param proxy_string: A string or a dictionary representation of the proxy.
:return:
"""
if proxy_string:
if isinstance(proxy_string, str):
proxy = urlparse(proxy_string)
try:
return {
'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
'username': proxy.username or '',
'password': proxy.password or '',
}
except ValueError:
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
raise TypeError(f'The proxy argument\'s string is in invalid format!')

elif isinstance(proxy_string, dict):
valid_keys = ('server', 'username', 'password', )
if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
return proxy_string
else:
raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')

else:
raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')

# The default value for proxy in Playwright's source is `None`
return None


def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
"""Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
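For reference, a sketch of what the new `construct_proxy_dict` helper accepts and returns (addresses and credentials are placeholders):

from scrapling.engines.toolbelt import construct_proxy_dict

# A proxy URL string is parsed into the dict format Playwright expects.
print(construct_proxy_dict('http://user:[email protected]:8080'))
# {'server': 'http://proxy.example.com:8080', 'username': 'user', 'password': 'pass'}

# A dict restricted to the keys 'server', 'username', and 'password' is returned unchanged;
# any other type or key raises a TypeError.
print(construct_proxy_dict({'server': 'http://127.0.0.1:8080'}))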
(Diffs for the remaining 4 changed files were not loaded on this page.)
