From 5cb7ffe5fdc0ad8af158a1bde38e87cc49a49bef Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sat, 25 May 2024 14:04:09 -0500 Subject: [PATCH] enh: proxies (#157) * enh: proxies * enh: proxies --- README.md | 16 +- examples/JobSpy_AllSites.py | 30 ---- examples/JobSpy_Demo.ipynb | 167 ------------------- examples/JobSpy_LongScrape.py | 78 --------- pyproject.toml | 2 +- src/jobspy/__init__.py | 4 +- src/jobspy/scrapers/__init__.py | 4 +- src/jobspy/scrapers/glassdoor/__init__.py | 7 +- src/jobspy/scrapers/indeed/__init__.py | 25 ++- src/jobspy/scrapers/linkedin/__init__.py | 28 ++-- src/jobspy/scrapers/utils.py | 133 +++++++++++---- src/jobspy/scrapers/ziprecruiter/__init__.py | 9 +- 12 files changed, 149 insertions(+), 354 deletions(-) delete mode 100644 examples/JobSpy_AllSites.py delete mode 100644 examples/JobSpy_Demo.ipynb delete mode 100644 examples/JobSpy_LongScrape.py diff --git a/README.md b/README.md index 90123e3d..6347c717 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ work with us.* - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously - Aggregates the job postings in a Pandas DataFrame -- Proxy support +- Rotating proxy support [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - Updated for release v1.1.3 @@ -39,7 +39,10 @@ jobs = scrape_jobs( results_wanted=20, hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old) country_indeed='USA', # only needed for indeed / glassdoor + # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower) + # proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"], + ) print(f"Found {len(jobs)} jobs") print(jobs.head()) @@ -76,8 +79,9 @@ Optional ├── job_type (str): | fulltime, parttime, internship, contract │ -├── proxy (str): -| in format 'http://user:pass@host:port' +├── proxies (list): +| in format ['user:pass@host:port', 'localhost'] +| each job board will round robin through the proxies │ ├── is_remote (bool) │ @@ -201,7 +205,7 @@ You can specify the following countries when searching on Indeed (use the exact ## Notes * Indeed is the best scraper currently with no rate limiting. * All the job board endpoints are capped at around 1000 jobs on a given search. -* LinkedIn is the most restrictive and usually rate limits around the 10th page. +* LinkedIn is the most restrictive and usually rate limits around the 10th page with a single IP; proxies are essentially required. ## Frequently Asked Questions @@ -216,7 +220,7 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues). **Q: Received a response code 429?** **A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend: -- Waiting some time between scrapes (site-dependent). -- Trying a VPN or proxy to change your IP address. +- Wait some time between scrapes (site-dependent). +- Try using the proxies parameter to change your IP address.
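To make the README change above concrete, here is a minimal usage sketch of the new `proxies` parameter as documented in that hunk; the proxy credentials and hosts are placeholders, and a `"localhost"` entry in the rotation is treated as a direct (un-proxied) connection.

```python
from jobspy import scrape_jobs

# Placeholder proxies in the documented "user:pass@host:port" format;
# each job board's scraper round-robins through this list on every request.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=20,
    country_indeed="USA",
    proxies=["user:pass@203.0.113.10:8080", "user:pass@203.0.113.11:8080", "localhost"],
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
```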
--- diff --git a/examples/JobSpy_AllSites.py b/examples/JobSpy_AllSites.py deleted file mode 100644 index ad43c294..00000000 --- a/examples/JobSpy_AllSites.py +++ /dev/null @@ -1,30 +0,0 @@ -from jobspy import scrape_jobs -import pandas as pd - -jobs: pd.DataFrame = scrape_jobs( - site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"], - search_term="software engineer", - location="Dallas, TX", - results_wanted=25, # be wary the higher it is, the more likey you'll get blocked (rotating proxy can help tho) - country_indeed="USA", - # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", -) - -# formatting for pandas -pd.set_option("display.max_columns", None) -pd.set_option("display.max_rows", None) -pd.set_option("display.width", None) -pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc - -# 1: output to console -print(jobs) - -# 2: output to .csv -jobs.to_csv("./jobs.csv", index=False) -print("outputted to jobs.csv") - -# 3: output to .xlsx -# jobs.to_xlsx('jobs.xlsx', index=False) - -# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook) -# display(jobs) diff --git a/examples/JobSpy_Demo.ipynb b/examples/JobSpy_Demo.ipynb deleted file mode 100644 index 6c182f31..00000000 --- a/examples/JobSpy_Demo.ipynb +++ /dev/null @@ -1,167 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "00a94b47-f47b-420f-ba7e-714ef219c006", - "metadata": {}, - "outputs": [], - "source": [ - "from jobspy import scrape_jobs\n", - "import pandas as pd\n", - "from IPython.display import display, HTML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f773e6c-d9fc-42cc-b0ef-63b739e78435", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.max_rows', None)\n", - "pd.set_option('display.width', None)\n", - "pd.set_option('display.max_colwidth', 50)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1253c1f8-9437-492e-9dd3-e7fe51099420", - "metadata": {}, - "outputs": [], - "source": [ - "# example 1 (no hyperlinks, USA)\n", - "jobs = scrape_jobs(\n", - " site_name=[\"linkedin\"],\n", - " location='san francisco',\n", - " search_term=\"engineer\",\n", - " results_wanted=5,\n", - "\n", - " # use if you want to use a proxy\n", - " # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n", - " proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n", - " #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n", - ")\n", - "display(jobs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a581b2d-f7da-4fac-868d-9efe143ee20a", - "metadata": {}, - "outputs": [], - "source": [ - "# example 2 - remote USA & hyperlinks\n", - "jobs = scrape_jobs(\n", - " site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n", - " # location='san francisco',\n", - " search_term=\"software engineer\",\n", - " country_indeed=\"USA\",\n", - " hyperlinks=True,\n", - " is_remote=True,\n", - " results_wanted=5, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fe8289bc-5b64-4202-9a64-7c117c83fd9a", - "metadata": {}, - "outputs": [], - "source": [ - "# use if hyperlinks=True\n", - "html = jobs.to_html(escape=False)\n", - "# change max-width: 200px to show more or less of the content\n", - "truncate_width = f'{html}'\n", - "display(HTML(truncate_width))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"951c2fe1-52ff-407d-8bb1-068049b36777", - "metadata": {}, - "outputs": [], - "source": [ - "# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n", - "jobs = scrape_jobs(\n", - " site_name=[\"linkedin\"],\n", - " location='berlin',\n", - " search_term=\"engineer\",\n", - " hyperlinks=True,\n", - " results_wanted=5,\n", - " easy_apply=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e37a521-caef-441c-8fc2-2eb5b2e7da62", - "metadata": {}, - "outputs": [], - "source": [ - "# use if hyperlinks=True\n", - "html = jobs.to_html(escape=False)\n", - "# change max-width: 200px to show more or less of the content\n", - "truncate_width = f'{html}'\n", - "display(HTML(truncate_width))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0650e608-0b58-4bf5-ae86-68348035b16a", - "metadata": {}, - "outputs": [], - "source": [ - "# example 4 - international indeed (no zip_recruiter)\n", - "jobs = scrape_jobs(\n", - " site_name=[\"indeed\"],\n", - " search_term=\"engineer\",\n", - " country_indeed = \"China\",\n", - " hyperlinks=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40913ac8-3f8a-4d7e-ac47-afb88316432b", - "metadata": {}, - "outputs": [], - "source": [ - "# use if hyperlinks=True\n", - "html = jobs.to_html(escape=False)\n", - "# change max-width: 200px to show more or less of the content\n", - "truncate_width = f'{html}'\n", - "display(HTML(truncate_width))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/JobSpy_LongScrape.py b/examples/JobSpy_LongScrape.py deleted file mode 100644 index d0ac0f88..00000000 --- a/examples/JobSpy_LongScrape.py +++ /dev/null @@ -1,78 +0,0 @@ -from jobspy import scrape_jobs -import pandas as pd -import os -import time - -# creates csv a new filename if the jobs.csv already exists. 
-csv_filename = "jobs.csv" -counter = 1 -while os.path.exists(csv_filename): - csv_filename = f"jobs_{counter}.csv" - counter += 1 - -# results wanted and offset -results_wanted = 1000 -offset = 0 - -all_jobs = [] - -# max retries -max_retries = 3 - -# nuumber of results at each iteration -results_in_each_iteration = 30 - -while len(all_jobs) < results_wanted: - retry_count = 0 - while retry_count < max_retries: - print("Doing from", offset, "to", offset + results_in_each_iteration, "jobs") - try: - jobs = scrape_jobs( - site_name=["indeed"], - search_term="software engineer", - # New York, NY - # Dallas, TX - # Los Angeles, CA - location="Los Angeles, CA", - results_wanted=min( - results_in_each_iteration, results_wanted - len(all_jobs) - ), - country_indeed="USA", - offset=offset, - # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", - ) - - # Add the scraped jobs to the list - all_jobs.extend(jobs.to_dict("records")) - - # Increment the offset for the next page of results - offset += results_in_each_iteration - - # Add a delay to avoid rate limiting (you can adjust the delay time as needed) - print(f"Scraped {len(all_jobs)} jobs") - print("Sleeping secs", 100 * (retry_count + 1)) - time.sleep(100 * (retry_count + 1)) # Sleep for 2 seconds between requests - - break # Break out of the retry loop if successful - except Exception as e: - print(f"Error: {e}") - retry_count += 1 - print("Sleeping secs before retry", 100 * (retry_count + 1)) - time.sleep(100 * (retry_count + 1)) - if retry_count >= max_retries: - print("Max retries reached. Exiting.") - break - -# DataFrame from the collected job data -jobs_df = pd.DataFrame(all_jobs) - -# Formatting -pd.set_option("display.max_columns", None) -pd.set_option("display.max_rows", None) -pd.set_option("display.width", None) -pd.set_option("display.max_colwidth", 50) - -print(jobs_df) - -jobs_df.to_csv(csv_filename, index=False) -print(f"Outputted to {csv_filename}") diff --git a/pyproject.toml b/pyproject.toml index cb275fb2..f94ae2e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.53" +version = "1.1.54" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index a2656cb6..4ad1f743 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -30,7 +30,7 @@ def scrape_jobs( results_wanted: int = 15, country_indeed: str = "usa", hyperlinks: bool = False, - proxy: str | None = None, + proxies: list[str] | str | None = None, description_format: str = "markdown", linkedin_fetch_description: bool | None = False, linkedin_company_ids: list[int] | None = None, @@ -96,7 +96,7 @@ def get_site_type(): def scrape_site(site: Site) -> Tuple[str, JobResponse]: scraper_class = SCRAPER_MAPPING[site] - scraper = scraper_class(proxy=proxy) + scraper = scraper_class(proxies=proxies) scraped_data: JobResponse = scraper.scrape(scraper_input) cap_name = site.value.capitalize() site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index 0ff23822..af278d74 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -39,9 +39,9 @@ class ScraperInput(BaseModel): class Scraper(ABC): - def __init__(self, site: Site, proxy: list[str] | None = None): + def __init__(self, site: Site, 
proxies: list[str] | None = None): + self.proxies = proxies self.site = site - self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy) @abstractmethod def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 89f5a95a..b0dd7339 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -34,12 +34,12 @@ class GlassdoorScraper(Scraper): - def __init__(self, proxy: Optional[str] = None): + def __init__(self, proxies: list[str] | str | None = None): """ Initializes GlassdoorScraper with the Glassdoor job search url """ site = Site(Site.GLASSDOOR) - super().__init__(site, proxy=proxy) + super().__init__(site, proxies=proxies) self.base_url = None self.country = None @@ -59,7 +59,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse: self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.base_url = self.scraper_input.country.get_glassdoor_url() - self.session = create_session(self.proxy, is_tls=True, has_retry=True) + self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True) token = self._get_csrf_token() self.headers["gd-csrf-token"] = token if token else self.fallback_token @@ -245,7 +245,6 @@ def _get_location(self, location: str, is_remote: bool) -> (int, str): if not location or is_remote: return "11047", "STATE" # remote options url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" - session = create_session(self.proxy, has_retry=True) res = self.session.get(url, headers=self.headers) if res.status_code != 200: if res.status_code == 429: diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 58303f5c..b5d6cd6a 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -12,14 +12,13 @@ from datetime import datetime from concurrent.futures import ThreadPoolExecutor, Future -import requests - from .. 
import Scraper, ScraperInput, Site from ..utils import ( extract_emails_from_text, get_enum_from_job_type, markdown_converter, logger, + create_session, ) from ...jobs import ( JobPost, @@ -33,10 +32,13 @@ class IndeedScraper(Scraper): - def __init__(self, proxy: str | None = None): + def __init__(self, proxies: list[str] | str | None = None): """ Initializes IndeedScraper with the Indeed API url """ + super().__init__(Site.INDEED, proxies=proxies) + + self.session = create_session(proxies=self.proxies, is_tls=False) self.scraper_input = None self.jobs_per_page = 100 self.num_workers = 10 @@ -45,8 +47,6 @@ def __init__(self, proxy: str | None = None): self.api_country_code = None self.base_url = None self.api_url = "https://apis.indeed.com/graphql" - site = Site(Site.INDEED) - super().__init__(site, proxy=proxy) def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ @@ -90,13 +90,13 @@ def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]: jobs = [] new_cursor = None filters = self._build_filters() - search_term = self.scraper_input.search_term.replace('"', '\\"') if self.scraper_input.search_term else "" + search_term = ( + self.scraper_input.search_term.replace('"', '\\"') + if self.scraper_input.search_term + else "" + ) query = self.job_search_query.format( - what=( - f'what: "{search_term}"' - if search_term - else "" - ), + what=(f'what: "{search_term}"' if search_term else ""), location=( f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}' if self.scraper_input.location @@ -111,11 +111,10 @@ def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]: } api_headers = self.api_headers.copy() api_headers["indeed-co"] = self.api_country_code - response = requests.post( + response = self.session.post( self.api_url, headers=api_headers, json=payload, - proxies=self.proxy, timeout=10, ) if response.status_code != 200: diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 18fbb849..840b2fbd 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -10,14 +10,13 @@ import time import random import regex as re -import urllib.parse from typing import Optional from datetime import datetime from threading import Lock from bs4.element import Tag from bs4 import BeautifulSoup -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse, unquote from .. 
import Scraper, ScraperInput, Site from ..exceptions import LinkedInException @@ -46,11 +45,19 @@ class LinkedInScraper(Scraper): band_delay = 4 jobs_per_page = 25 - def __init__(self, proxy: Optional[str] = None): + def __init__(self, proxies: list[str] | str | None = None): """ Initializes LinkedInScraper with the LinkedIn job search url """ - super().__init__(Site(Site.LINKEDIN), proxy=proxy) + super().__init__(Site.LINKEDIN, proxies=proxies) + self.session = create_session( + proxies=self.proxies, + is_tls=False, + has_retry=True, + delay=5, + clear_cookies=True, + ) + self.session.headers.update(self.headers) self.scraper_input = None self.country = "worldwide" self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+') @@ -74,7 +81,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse: ) while continue_search(): logger.info(f"LinkedIn search page: {page // 25 + 1}") - session = create_session(is_tls=False, has_retry=True, delay=5) params = { "keywords": scraper_input.search_term, "location": scraper_input.location, @@ -99,12 +105,9 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse: params = {k: v for k, v in params.items() if v is not None} try: - response = session.get( + response = self.session.get( f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?", params=params, - allow_redirects=True, - proxies=self.proxy, - headers=self.headers, timeout=10, ) if response.status_code not in range(200, 400): @@ -241,10 +244,7 @@ def _get_job_details(self, job_page_url: str) -> dict: :return: dict """ try: - session = create_session(is_tls=False, has_retry=True) - response = session.get( - job_page_url, headers=self.headers, timeout=5, proxies=self.proxy - ) + response = self.session.get(job_page_url, timeout=5) response.raise_for_status() except: return {} @@ -340,7 +340,7 @@ def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None: job_url_direct_content.decode_contents().strip() ) if job_url_direct_match: - job_url_direct = urllib.parse.unquote(job_url_direct_match.group()) + job_url_direct = unquote(job_url_direct_match.group()) return job_url_direct diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 8fef4216..294d20c1 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -2,6 +2,8 @@ import re import logging +from itertools import cycle + import requests import tls_client import numpy as np @@ -21,6 +23,104 @@ logger.addHandler(console_handler) +class RotatingProxySession: + def __init__(self, proxies=None): + if isinstance(proxies, str): + self.proxy_cycle = cycle([self.format_proxy(proxies)]) + elif isinstance(proxies, list): + self.proxy_cycle = ( + cycle([self.format_proxy(proxy) for proxy in proxies]) + if proxies + else None + ) + else: + self.proxy_cycle = None + + @staticmethod + def format_proxy(proxy): + """Utility method to format a proxy string into a dictionary.""" + if proxy.startswith("http://") or proxy.startswith("https://"): + return {"http": proxy, "https": proxy} + return {"http": f"http://{proxy}", "https": f"http://{proxy}"} + + +class RequestsRotating(RotatingProxySession, requests.Session): + + def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False): + RotatingProxySession.__init__(self, proxies=proxies) + requests.Session.__init__(self) + self.clear_cookies = clear_cookies + self.allow_redirects = True + self.setup_session(has_retry, delay) + + def setup_session(self, has_retry, delay): + if has_retry: + retries = Retry( + total=3, + 
connect=3, + status=3, + status_forcelist=[500, 502, 503, 504, 429], + backoff_factor=delay, + ) + adapter = HTTPAdapter(max_retries=retries) + self.mount("http://", adapter) + self.mount("https://", adapter) + + def request(self, method, url, **kwargs): + if self.clear_cookies: + self.cookies.clear() + + if self.proxy_cycle: + next_proxy = next(self.proxy_cycle) + if next_proxy["http"] != "http://localhost": + self.proxies = next_proxy + else: + self.proxies = {} + return requests.Session.request(self, method, url, **kwargs) + + +class TLSRotating(RotatingProxySession, tls_client.Session): + + def __init__(self, proxies=None): + RotatingProxySession.__init__(self, proxies=proxies) + tls_client.Session.__init__(self, random_tls_extension_order=True) + + def execute_request(self, *args, **kwargs): + if self.proxy_cycle: + next_proxy = next(self.proxy_cycle) + if next_proxy["http"] != "http://localhost": + self.proxies = next_proxy + else: + self.proxies = {} + response = tls_client.Session.execute_request(self, *args, **kwargs) + return response + + +def create_session( + *, + proxies: dict | str | None = None, + is_tls: bool = True, + has_retry: bool = False, + delay: int = 1, + clear_cookies: bool = False, +) -> requests.Session: + """ + Creates a requests session with optional tls, proxy, and retry settings. + :return: A session object + """ + if is_tls: + session = TLSRotating(proxies=proxies) + else: + session = RequestsRotating( + proxies=proxies, + has_retry=has_retry, + delay=delay, + clear_cookies=clear_cookies, + ) + + return session + + def set_logger_level(verbose: int = 2): """ Adjusts the logger's level. This function allows the logging level to be changed at runtime. @@ -52,39 +152,6 @@ def extract_emails_from_text(text: str) -> list[str] | None: return email_regex.findall(text) -def create_session( - proxy: dict | None = None, - is_tls: bool = True, - has_retry: bool = False, - delay: int = 1, -) -> requests.Session: - """ - Creates a requests session with optional tls, proxy, and retry settings. - :return: A session object - """ - if is_tls: - session = tls_client.Session(random_tls_extension_order=True) - session.proxies = proxy - else: - session = requests.Session() - session.allow_redirects = True - if proxy: - session.proxies.update(proxy) - if has_retry: - retries = Retry( - total=3, - connect=3, - status=3, - status_forcelist=[500, 502, 503, 504, 429], - backoff_factor=delay, - ) - adapter = HTTPAdapter(max_retries=retries) - - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - - def get_enum_from_job_type(job_type_str: str) -> JobType | None: """ Given a string, returns the corresponding JobType enum member if a match is found. 
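The new `create_session` factory in `utils.py` returns either a `TLSRotating` or a `RequestsRotating` session; both mix in `RotatingProxySession`, which formats each proxy string into a requests-style dict and advances an `itertools.cycle` on every request, with a `"localhost"` entry clearing the proxy so the request goes out directly. A speculative standalone sketch of building and exercising such a session — the proxy entries and test URL are placeholders:

```python
from jobspy.scrapers.utils import create_session

# Placeholder proxy list; entries without a scheme get "http://" prepended by format_proxy.
session = create_session(
    proxies=["user:pass@203.0.113.10:8080", "https://203.0.113.11:9000", "localhost"],
    is_tls=False,        # use RequestsRotating (a requests.Session) instead of TLSRotating
    has_retry=True,      # retry 429/5xx responses with backoff_factor=delay
    delay=2,
    clear_cookies=True,  # wipe cookies before each request, as the LinkedIn scraper does
)

# Each call picks the next proxy in the cycle; "localhost" means a direct connection.
resp = session.get("https://httpbin.org/ip", timeout=10)
print(resp.status_code, resp.json())
```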
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index fbe896ff..7bf51bf5 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -36,14 +36,15 @@ class ZipRecruiterScraper(Scraper): base_url = "https://www.ziprecruiter.com" api_url = "https://api.ziprecruiter.com" - def __init__(self, proxy: Optional[str] = None): + def __init__(self, proxies: list[str] | str | None = None): """ Initializes ZipRecruiterScraper with the ZipRecruiter job search url """ + super().__init__(Site.ZIP_RECRUITER, proxies=proxies) + self.scraper_input = None - self.session = create_session(proxy) + self.session = create_session(proxies=proxies) self._get_cookies() - super().__init__(Site.ZIP_RECRUITER, proxy=proxy) self.delay = 5 self.jobs_per_page = 20 @@ -151,7 +152,7 @@ def _process_job(self, job: dict) -> JobPost | None: comp_max = int(job["compensation_max"]) if "compensation_max" in job else None comp_currency = job.get("compensation_currency") return JobPost( - id=str(job['listing_key']), + id=str(job["listing_key"]), title=title, company_name=company, location=location,
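The ZipRecruiter change above completes the same wiring applied to Indeed and LinkedIn: call `super().__init__(site, proxies=...)` first so `self.proxies` is set, then build the session from it. A condensed, hypothetical scraper illustrating that pattern — the class name, target URL, and reuse of `Site.ZIP_RECRUITER` here are for illustration only and are not part of the patch:

```python
from jobspy.scrapers import Scraper, ScraperInput, Site
from jobspy.scrapers.utils import create_session


class ExampleScraper(Scraper):
    """Hypothetical scraper showing the proxies wiring the real scrapers now use."""

    def __init__(self, proxies: list[str] | str | None = None):
        # Base class stores the proxies before the rotating session is created from them.
        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
        self.session = create_session(proxies=self.proxies, is_tls=False, has_retry=True)

    def scrape(self, scraper_input: ScraperInput):
        # Every request through self.session rotates to the next proxy in the list.
        resp = self.session.get("https://example.com/api/jobs", timeout=10)
        resp.raise_for_status()
        return resp.json()
```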