Merge pull request #14 from D4Vinci/dev
v0.2.5
D4Vinci authored Nov 23, 2024
2 parents e9b0102 + 0e01c1c commit 4cea8c9
Showing 12 changed files with 215 additions and 18 deletions.
2 changes: 1 addition & 1 deletion scrapling/__init__.py
@@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.4"
__version__ = "0.2.5"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


9 changes: 3 additions & 6 deletions scrapling/engines/camo.py
@@ -104,13 +104,10 @@ def fetch(self, url: str) -> Response:

if self.wait_selector and type(self.wait_selector) is str:
waiter = page.locator(self.wait_selector)
-waiter.wait_for(state=self.wait_selector_state)
+waiter.first.wait_for(state=self.wait_selector_state)

-content_type = res.headers.get('content-type', '')
-# Parse charset from content-type
-encoding = 'utf-8' # default encoding
-if 'charset=' in content_type.lower():
-    encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+# This will be parsed inside `Response`
+encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding

status_text = res.status_text
# PlayWright API sometimes give empty status text for some reason!
9 changes: 3 additions & 6 deletions scrapling/engines/pw.py
@@ -214,13 +214,10 @@ def fetch(self, url: str) -> Response:

if self.wait_selector and type(self.wait_selector) is str:
waiter = page.locator(self.wait_selector)
-waiter.wait_for(state=self.wait_selector_state)
+waiter.first.wait_for(state=self.wait_selector_state)

-content_type = res.headers.get('content-type', '')
-# Parse charset from content-type
-encoding = 'utf-8' # default encoding
-if 'charset=' in content_type.lower():
-    encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+# This will be parsed inside `Response`
+encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding

status_text = res.status_text
# PlayWright API sometimes give empty status text for some reason!
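
Both engines make the same wait_selector change: the locator now goes through `.first` before `wait_for`. Below is a minimal standalone sketch (not Scrapling's code; the URL and the `p` selector are placeholders) of the Playwright pattern being adopted — `Locator.wait_for` can raise a strict-mode violation when the selector matches more than one element, while waiting on `.first` targets only the first match.

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")  # placeholder URL

    waiter = page.locator("p")  # may resolve to several elements
    # Calling waiter.wait_for(...) directly can raise a strict-mode violation
    # when more than one node matches; waiting on .first avoids that.
    waiter.first.wait_for(state="attached")

    print(page.title())
    browser.close()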
70 changes: 69 additions & 1 deletion scrapling/engines/toolbelt/custom.py
@@ -3,11 +3,78 @@
"""
import inspect
import logging
from email.message import Message

from scrapling.core.custom_types import MappingProxyType
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple


class ResponseEncoding:
    DEFAULT_ENCODING = "utf-8"
    ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

    @classmethod
    @cache(maxsize=None)
    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
        """Parse content type and parameters from a content-type header value.
        Uses `email.message.Message` for robust header parsing according to RFC 2045.
        :param header_value: Raw content-type header string
        :return: Tuple of (content_type, parameters_dict)
        """
        # Create a Message object and set the Content-Type header then get the content type and parameters
        msg = Message()
        msg['content-type'] = header_value

        content_type = msg.get_content_type()
        params = dict(msg.get_params(failobj=[]))

        # Remove the content-type from params if present somehow
        params.pop('content-type', None)

        return content_type, params

    @classmethod
    @cache(maxsize=None)
    def get_value(cls, content_type: Optional[str]) -> str:
        """Determine the appropriate character encoding from a content-type header.
        The encoding is determined by these rules in order:
        1. If no content-type is provided, use UTF-8
        2. If charset parameter is present, use that encoding
        3. If content-type is text/*, use ISO-8859-1 per HTTP/1.1 spec
        4. If content-type is application/json, use UTF-8 per RFC 4627
        5. Default to UTF-8 if nothing else matches
        :param content_type: Content-Type header value or None
        :return: String naming the character encoding
        """
        if not content_type:
            return cls.DEFAULT_ENCODING

        try:
            content_type, params = cls.__parse_content_type(content_type)

            # First check for explicit charset parameter
            if "charset" in params:
                encoding = params["charset"].strip("'\"")
                "test".encode(encoding) # Validate encoding
                return encoding

            # Apply content-type specific rules
            if content_type in cls.ISO_8859_1_CONTENT_TYPES:
                return "ISO-8859-1"

            if content_type == "application/json":
                return cls.DEFAULT_ENCODING

            return cls.DEFAULT_ENCODING

        except (ValueError, LookupError, UnicodeEncodeError):
            return cls.DEFAULT_ENCODING


class Response(Adaptor):
@@ -20,6 +87,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c
self.cookies = cookies
self.headers = headers
self.request_headers = request_headers
encoding = ResponseEncoding.get_value(encoding)
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
# For back-ward compatibility
self.adaptor = self
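
For orientation, here is a brief usage sketch of the new `ResponseEncoding` helper; the expected values follow the rules documented in `get_value` and the cases asserted in the new tests/fetchers/test_utils.py further down.

from scrapling.engines.toolbelt.custom import ResponseEncoding

print(ResponseEncoding.get_value('text/html; charset=windows-1252'))  # windows-1252 (explicit charset wins)
print(ResponseEncoding.get_value('text/html'))                        # ISO-8859-1 (text/* default per HTTP/1.1)
print(ResponseEncoding.get_value('application/json'))                 # utf-8 (RFC 4627 default)
print(ResponseEncoding.get_value(None))                               # utf-8 (no header at all)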
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = scrapling
-version = 0.2.4
+version = 0.2.5
author = Karim Shoair
author_email = [email protected]
description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setup(
name="scrapling",
version="0.2.4",
version="0.2.5",
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
impressive speed improvements over many popular scraping tools.""",
2 changes: 2 additions & 0 deletions tests/fetchers/test_camoufox.py
@@ -36,6 +36,7 @@ def test_waiting_selector(self):
def test_waiting_selector(self):
"""Test if waiting for a selector make page does not finish loading or not"""
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

def test_cookies_loading(self):
"""Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ def test_properties(self):
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)

def test_infinite_timeout(self):
"""Test if infinite timeout breaks the code or not"""
2 changes: 2 additions & 0 deletions tests/fetchers/test_playwright.py
@@ -35,6 +35,7 @@ def test_waiting_selector(self):
def test_waiting_selector(self):
"""Test if waiting for a selector make page does not finish loading or not"""
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

def test_cookies_loading(self):
"""Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ def test_properties(self):
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)

def test_cdp_url(self):
"""Test if it's going to try to connect to cdp url or not"""
129 changes: 129 additions & 0 deletions tests/fetchers/test_utils.py
@@ -0,0 +1,129 @@
import unittest

from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText


class TestPlayWrightFetcher(unittest.TestCase):
    def setUp(self):
        self.content_type_map = {
            # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
            'text/html; charset=UTF-8': 'UTF-8',
            'text/html; charset=ISO-8859-1': 'ISO-8859-1',
            'text/html': 'ISO-8859-1',
            'application/json; charset=UTF-8': 'UTF-8',
            'application/json': 'utf-8',
            'text/json': 'utf-8',
            'application/javascript; charset=UTF-8': 'UTF-8',
            'application/javascript': 'utf-8',
            'text/plain; charset=UTF-8': 'UTF-8',
            'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
            'text/plain': 'ISO-8859-1',
            'application/xhtml+xml; charset=UTF-8': 'UTF-8',
            'application/xhtml+xml': 'utf-8',
            'text/html; charset=windows-1252': 'windows-1252',
            'application/json; charset=windows-1252': 'windows-1252',
            'text/plain; charset=windows-1252': 'windows-1252',
            'text/html; charset="UTF-8"': 'UTF-8',
            'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
            'text/html; charset="windows-1252"': 'windows-1252',
            'application/json; charset="UTF-8"': 'UTF-8',
            'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
            'application/json; charset="windows-1252"': 'windows-1252',
            'text/json; charset="UTF-8"': 'UTF-8',
            'application/javascript; charset="UTF-8"': 'UTF-8',
            'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
            'text/plain; charset="UTF-8"': 'UTF-8',
            'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
            'text/plain; charset="windows-1252"': 'windows-1252',
            'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
            'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
            'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
            'text/html; charset="US-ASCII"': 'US-ASCII',
            'application/json; charset="US-ASCII"': 'US-ASCII',
            'text/plain; charset="US-ASCII"': 'US-ASCII',
            'text/html; charset="Shift_JIS"': 'Shift_JIS',
            'application/json; charset="Shift_JIS"': 'Shift_JIS',
            'text/plain; charset="Shift_JIS"': 'Shift_JIS',
            'application/xml; charset="UTF-8"': 'UTF-8',
            'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
            'application/xml': 'utf-8',
            'text/xml; charset="UTF-8"': 'UTF-8',
            'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
            'text/xml': 'utf-8'
        }
        self.status_map = {
            100: "Continue",
            101: "Switching Protocols",
            102: "Processing",
            103: "Early Hints",
            200: "OK",
            201: "Created",
            202: "Accepted",
            203: "Non-Authoritative Information",
            204: "No Content",
            205: "Reset Content",
            206: "Partial Content",
            207: "Multi-Status",
            208: "Already Reported",
            226: "IM Used",
            300: "Multiple Choices",
            301: "Moved Permanently",
            302: "Found",
            303: "See Other",
            304: "Not Modified",
            305: "Use Proxy",
            307: "Temporary Redirect",
            308: "Permanent Redirect",
            400: "Bad Request",
            401: "Unauthorized",
            402: "Payment Required",
            403: "Forbidden",
            404: "Not Found",
            405: "Method Not Allowed",
            406: "Not Acceptable",
            407: "Proxy Authentication Required",
            408: "Request Timeout",
            409: "Conflict",
            410: "Gone",
            411: "Length Required",
            412: "Precondition Failed",
            413: "Payload Too Large",
            414: "URI Too Long",
            415: "Unsupported Media Type",
            416: "Range Not Satisfiable",
            417: "Expectation Failed",
            418: "I'm a teapot",
            421: "Misdirected Request",
            422: "Unprocessable Entity",
            423: "Locked",
            424: "Failed Dependency",
            425: "Too Early",
            426: "Upgrade Required",
            428: "Precondition Required",
            429: "Too Many Requests",
            431: "Request Header Fields Too Large",
            451: "Unavailable For Legal Reasons",
            500: "Internal Server Error",
            501: "Not Implemented",
            502: "Bad Gateway",
            503: "Service Unavailable",
            504: "Gateway Timeout",
            505: "HTTP Version Not Supported",
            506: "Variant Also Negotiates",
            507: "Insufficient Storage",
            508: "Loop Detected",
            510: "Not Extended",
            511: "Network Authentication Required"
        }

    def test_parsing_content_type(self):
        """Test if parsing different types of content-type returns the expected result"""
        for header_value, expected_encoding in self.content_type_map.items():
            self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)

    def test_parsing_response_status(self):
        """Test if using different http responses' status codes returns the expected result"""
        for status_code, expected_status_text in self.status_map.items():
            self.assertEqual(StatusText.get(status_code), expected_status_text)

        self.assertEqual(StatusText.get(1000), "Unknown Status Code")
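
As a quick illustration of the `StatusText` behavior these tests assert, the snippet below uses values taken directly from the mapping above: known codes map to their reason phrase and unknown codes fall back to a fixed string.

from scrapling.engines.toolbelt.custom import StatusText

print(StatusText.get(200))   # OK
print(StatusText.get(418))   # I'm a teapot
print(StatusText.get(1000))  # Unknown Status Code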
2 changes: 1 addition & 1 deletion tests/parser/test_general.py
@@ -278,7 +278,7 @@ def test_performance(self):
self.assertEqual(len(elements), 5000)
# Converting 5000 elements to a class and doing operations on them will take time
# Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
-self.assertLess(end_time - start_time, 0.1)
+self.assertLess(end_time - start_time, 0.5) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds


# Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
1 change: 1 addition & 0 deletions tests/requirements.txt
@@ -5,3 +5,4 @@ camoufox
werkzeug<3.0.0
pytest-httpbin==2.1.0
httpbin~=0.10.0
pytest-xdist
3 changes: 2 additions & 1 deletion tox.ini
@@ -12,9 +12,10 @@ changedir = tests
deps =
-r{toxinidir}/tests/requirements.txt
commands =
playwright install chromium
playwright install-deps chromium firefox
camoufox fetch --browserforge
-pytest --cov=scrapling --cov-report=xml
+pytest --cov=scrapling --cov-report=xml -n auto

[testenv:pre-commit]
basepython = python3
