Skip to content

Commit

Permalink
Merge pull request #15 from D4Vinci/docs
Browse files Browse the repository at this point in the history
Doc adjustments to use Sphinx soon
  • Loading branch information
D4Vinci authored Nov 23, 2024
2 parents 4cea8c9 + 1929d9b commit e94c503
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 27 deletions.
5 changes: 2 additions & 3 deletions scrapling/core/custom_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,8 @@ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entiti


class AttributesHandler(Mapping):
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
at the same time I use it to add more functionalities.
If standard dictionary is needed, just convert this class to dictionary with `dict` function
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
If standard dictionary is needed, just convert this class to dictionary with `dict` function
"""
__slots__ = ('_data',)

Expand Down
10 changes: 6 additions & 4 deletions scrapling/core/translator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""
Most of this file is an adapted version of the translator from the parsel library, with some modifications for one important reason...
To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
which will be important in future releases but most importantly...
so you don't have to learn a new selectors/api method like what bs4 did with soupsieve :)
> if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
So you don't have to learn a new selectors/api method like what bs4 did with soupsieve :)
if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
"""

import re
Expand Down
6 changes: 5 additions & 1 deletion scrapling/engines/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, f
@staticmethod
def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
"""Adds useragent to headers if it doesn't exist, generates real headers and appends them to the current headers, and
finally generates a referer header that looks as if this request came from Google's search of the current URL's domain.
finally generates a referer header that looks as if this request came from Google's search of the current URL's domain.
:param headers: Current headers in the request if the user passed any
:param url: The Target URL.
Expand Down Expand Up @@ -65,6 +65,7 @@ def _prepare_response(self, response: httpxResponse) -> Response:

def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP GET request for you but with some added flavors.
:param url: Target url.
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had come from Google's search of this URL's domain.
Expand All @@ -77,6 +78,7 @@ def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)

def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP POST request for you but with some added flavors.
:param url: Target url.
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had come from Google's search of this URL's domain.
Expand All @@ -89,6 +91,7 @@ def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict

def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP DELETE request for you but with some added flavors.
:param url: Target url.
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had come from Google's search of this URL's domain.
Expand All @@ -101,6 +104,7 @@ def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Di

def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP PUT request for you but with some added flavors.
:param url: Target url.
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request had come from Google's search of this URL's domain.
Expand Down
24 changes: 12 additions & 12 deletions scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@


class ResponseEncoding:
DEFAULT_ENCODING = "utf-8"
ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
__DEFAULT_ENCODING = "utf-8"
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

@classmethod
@cache(maxsize=None)
Expand Down Expand Up @@ -43,17 +43,17 @@ def get_value(cls, content_type: Optional[str]) -> str:
"""Determine the appropriate character encoding from a content-type header.
The encoding is determined by these rules in order:
1. If no content-type is provided, use UTF-8
2. If charset parameter is present, use that encoding
3. If content-type is text/*, use ISO-8859-1 per HTTP/1.1 spec
4. If content-type is application/json, use UTF-8 per RFC 4627
5. Default to UTF-8 if nothing else matches
1. If no content-type is provided, use UTF-8
2. If charset parameter is present, use that encoding
3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
4. If content-type is application/json, use UTF-8 per RFC 4627
5. Default to UTF-8 if nothing else matches
:param content_type: Content-Type header value or None
:return: String naming the character encoding
"""
if not content_type:
return cls.DEFAULT_ENCODING
return cls.__DEFAULT_ENCODING

try:
content_type, params = cls.__parse_content_type(content_type)
Expand All @@ -65,16 +65,16 @@ def get_value(cls, content_type: Optional[str]) -> str:
return encoding

# Apply content-type specific rules
if content_type in cls.ISO_8859_1_CONTENT_TYPES:
if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
return "ISO-8859-1"

if content_type == "application/json":
return cls.DEFAULT_ENCODING
return cls.__DEFAULT_ENCODING

return cls.DEFAULT_ENCODING
return cls.__DEFAULT_ENCODING

except (ValueError, LookupError, UnicodeEncodeError):
return cls.DEFAULT_ENCODING
return cls.__DEFAULT_ENCODING


class Response(Adaptor):
Expand Down
22 changes: 15 additions & 7 deletions scrapling/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
"""
def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP GET request for you but with some added flavors.
:param url: Target url.
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
Expand All @@ -24,6 +25,7 @@ def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[i

def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP POST request for you but with some added flavors.
:param url: Target url.
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
Expand All @@ -37,19 +39,22 @@ def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[

def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP PUT request for you but with some added flavors.
:param url: Target url
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
create a referer header as if this request came from Google's search of this URL's domain.
create a referer header as if this request came from Google's search of this URL's domain.
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
"""
response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
return response_object

def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
"""Make basic HTTP DELETE request for you but with some added flavors.
:param url: Target url
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
Expand Down Expand Up @@ -77,6 +82,7 @@ def fetch(
) -> Response:
"""
Opens up a browser and does your request based on your chosen options below.
:param url: Target url.
:param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
:param block_images: Prevent the loading of images through Firefox preferences.
Expand Down Expand Up @@ -127,14 +133,15 @@ class PlayWrightFetcher(BaseFetcher):
Using this Fetcher class, you can do requests with:
- Vanilla Playwright without any modifications other than the ones you chose.
- Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
Some of the things stealth mode does include:
1) Patches the CDP runtime fingerprint.
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
3) Using custom flags on launch to hide Playwright even more and make it faster.
4) Generates real browser's headers of the same type and same user OS, then appends them to the request.
Some of the things stealth mode does include:
1) Patches the CDP runtime fingerprint.
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
3) Using custom flags on launch to hide Playwright even more and make it faster.
4) Generates real browser's headers of the same type and same user OS, then appends them to the request.
- Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
- NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
> Note that these are the main options with PlayWright but they can be mixed together.
> Note that these are the main options with PlayWright but they can be mixed together.
"""
def fetch(
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
Expand All @@ -147,6 +154,7 @@ def fetch(
nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
) -> Response:
"""Opens up a browser and do your request based on your chosen options below.
:param url: Target url.
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
:param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
Expand Down

0 comments on commit e94c503

Please sign in to comment.