Skip to content

Commit

Permalink
Merge pull request #11 from D4Vinci/dev
Browse files Browse the repository at this point in the history
v0.2.4
  • Loading branch information
D4Vinci authored Nov 20, 2024
2 parents 1473803 + 17d7934 commit e9b0102
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 8 deletions.
2 changes: 1 addition & 1 deletion scrapling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.3"
__version__ = "0.2.4"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


Expand Down
10 changes: 8 additions & 2 deletions scrapling/engines/camo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
get_os_name,
intercept_route,
check_type_validity,
Expand Down Expand Up @@ -111,12 +112,17 @@ def fetch(self, url: str) -> Response:
if 'charset=' in content_type.lower():
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

status_text = res.status_text
# The Playwright API sometimes gives an empty status text for some reason!
if not status_text:
status_text = StatusText.get(res.status)

response = Response(
url=res.url,
text=page.content(),
body=res.body(),
body=page.content().encode('utf-8'),
status=res.status,
reason=res.status_text,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
Expand Down
10 changes: 8 additions & 2 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
js_bypass_path,
intercept_route,
generate_headers,
Expand Down Expand Up @@ -221,12 +222,17 @@ def fetch(self, url: str) -> Response:
if 'charset=' in content_type.lower():
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

status_text = res.status_text
# The Playwright API sometimes gives an empty status text for some reason!
if not status_text:
status_text = StatusText.get(res.status)

response = Response(
url=res.url,
text=page.content(),
body=res.body(),
body=page.content().encode('utf-8'),
status=res.status,
reason=res.status_text,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
Expand Down
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .custom import (
Response,
do_nothing,
StatusText,
BaseFetcher,
get_variable_name,
check_type_validity,
Expand Down
80 changes: 79 additions & 1 deletion scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import inspect
import logging

from scrapling.core.utils import setup_basic_logging
from scrapling.core.custom_types import MappingProxyType
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core.utils import setup_basic_logging, cache
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


Expand Down Expand Up @@ -67,6 +68,83 @@ def __init__(
self.adaptor_arguments.update({'automatch_domain': automatch_domain})


class StatusText:
    """Map an HTTP status code to its standard reason phrase.

    Used as a fallback when the browser automation API (Playwright) returns
    an empty status text for a response.
    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
    """
    # Read-only view: the phrase table must never be mutated at runtime.
    _phrases = MappingProxyType({
        100: "Continue",
        101: "Switching Protocols",
        102: "Processing",
        103: "Early Hints",
        200: "OK",
        201: "Created",
        202: "Accepted",
        203: "Non-Authoritative Information",
        204: "No Content",
        205: "Reset Content",
        206: "Partial Content",
        207: "Multi-Status",
        208: "Already Reported",
        226: "IM Used",
        300: "Multiple Choices",
        301: "Moved Permanently",
        302: "Found",
        303: "See Other",
        304: "Not Modified",
        305: "Use Proxy",
        307: "Temporary Redirect",
        308: "Permanent Redirect",
        400: "Bad Request",
        401: "Unauthorized",
        402: "Payment Required",
        403: "Forbidden",
        404: "Not Found",
        405: "Method Not Allowed",
        406: "Not Acceptable",
        407: "Proxy Authentication Required",
        408: "Request Timeout",
        409: "Conflict",
        410: "Gone",
        411: "Length Required",
        412: "Precondition Failed",
        413: "Payload Too Large",
        414: "URI Too Long",
        415: "Unsupported Media Type",
        416: "Range Not Satisfiable",
        417: "Expectation Failed",
        418: "I'm a teapot",
        421: "Misdirected Request",
        422: "Unprocessable Entity",
        423: "Locked",
        424: "Failed Dependency",
        425: "Too Early",
        426: "Upgrade Required",
        428: "Precondition Required",
        429: "Too Many Requests",
        431: "Request Header Fields Too Large",
        451: "Unavailable For Legal Reasons",
        500: "Internal Server Error",
        501: "Not Implemented",
        502: "Bad Gateway",
        503: "Service Unavailable",
        504: "Gateway Timeout",
        505: "HTTP Version Not Supported",
        506: "Variant Also Negotiates",
        507: "Insufficient Storage",
        508: "Loop Detected",
        510: "Not Extended",
        511: "Network Authentication Required"
    })

    @classmethod
    def get(cls, status_code: int) -> str:
        """Return the reason phrase for ``status_code``.

        Unknown codes yield the literal fallback "Unknown Status Code".
        No memoization: the mapping lookup is already O(1), so a cache
        decorator would only add call overhead and retain keys needlessly.
        """
        return cls._phrases.get(status_code, "Unknown Status Code")


def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
"""This function checks whether the passed engine can be used by a Fetcher-type class or not.
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scrapling
version = 0.2.3
version = 0.2.4
author = Karim Shoair
author_email = [email protected]
description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="scrapling",
version="0.2.3",
version="0.2.4",
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
impressive speed improvements over many popular scraping tools.""",
Expand Down

0 comments on commit e9b0102

Please sign in to comment.