From e3fc222eb53c8a90d32a4ec248274de1334bd8f9 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 29 Oct 2023 08:54:56 -0500 Subject: [PATCH] readd proxy support for zip (#64) --- pyproject.toml | 2 +- src/jobspy/scrapers/indeed/__init__.py | 2 +- src/jobspy/scrapers/utils.py | 31 +++-- src/jobspy/scrapers/ziprecruiter/__init__.py | 133 ++----------------- 4 files changed, 32 insertions(+), 136 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d102ab..ce7d32b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.21" +version = "1.1.22" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 3800221..297cfe1 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -58,7 +58,6 @@ def scrape_page( self.country = scraper_input.country domain = self.country.domain_value self.url = f"https://{domain}.indeed.com" - session = create_session(self.proxy) params = { "q": scraper_input.search_term, @@ -78,6 +77,7 @@ def scrape_page( if sc_values: params["sc"] = "0kf:" + "".join(sc_values) + ";" try: + session = create_session(self.proxy, is_tls=True) response = session.get( f"{self.url}/jobs", headers=self.get_headers(), diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index f34a48d..77698ba 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,4 +1,6 @@ import re + +import requests import tls_client from ..jobs import JobType @@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None: return email_regex.findall(text) -def create_session(proxy: str | None = None): +def create_session(proxy: dict | None = None, is_tls: bool = True): """ Creates a tls client session :return: A session object with or without proxies. """ - session = tls_client.Session( - client_identifier="chrome112", - random_tls_extension_order=True, - ) - session.proxies = proxy - # TODO multiple proxies - # if self.proxies: - # session.proxies = { - # "http": random.choice(self.proxies), - # "https": random.choice(self.proxies), - # } + if is_tls: + session = tls_client.Session( + client_identifier="chrome112", + random_tls_extension_order=True, + ) + session.proxies = proxy + # TODO multiple proxies + # if self.proxies: + # session.proxies = { + # "http": random.choice(self.proxies), + # "https": random.choice(self.proxies), + # } + else: + session = requests.Session() + session.allow_redirects = True + session.proxies.update(proxy) return session diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index 298e8df..b999746 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -9,25 +9,14 @@ import re from datetime import datetime, date from typing import Optional, Tuple, Any -from urllib.parse import urlparse, parse_qs, urlunparse -import requests from bs4 import BeautifulSoup -from bs4.element import Tag -from concurrent.futures import ThreadPoolExecutor, Future +from concurrent.futures import ThreadPoolExecutor from .. import Scraper, ScraperInput, Site from ..exceptions import ZipRecruiterException from ..utils import count_urgent_words, extract_emails_from_text, create_session -from ...jobs import ( - JobPost, - Compensation, - CompensationInterval, - Location, - JobResponse, - JobType, - Country, -) +from ...jobs import JobPost, Compensation, Location, JobResponse, JobType class ZipRecruiterScraper(Scraper): @@ -42,21 +31,22 @@ def __init__(self, proxy: Optional[str] = None): self.jobs_per_page = 20 self.seen_urls = set() - def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]: + def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]: """ Scrapes a page of ZipRecruiter for jobs with scraper_input criteria :param scraper_input: + :param continue_token: :return: jobs found on page """ params = self.add_params(scraper_input) if continue_token: params['continue'] = continue_token try: - response = requests.get( + session = create_session(self.proxy, is_tls=False) + response = session.get( f"https://api.ziprecruiter.com/jobs-app/jobs", headers=self.headers(), params=self.add_params(scraper_input), - allow_redirects=True, timeout=10, ) if response.status_code != 200: @@ -73,7 +63,7 @@ def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optiona jobs_list = response_data.get("jobs", []) next_continue_token = response_data.get('continue', None) - with ThreadPoolExecutor(max_workers=10) as executor: + with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: job_results = [ executor.submit(self.process_job, job) for job in jobs_list @@ -109,12 +99,12 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse: return JobResponse(jobs=job_list) - def process_job(self, job: dict) -> JobPost: - """the most common type of jobs page on ZR""" + @staticmethod + def process_job(job: dict) -> JobPost: + """ Processes an individual job dict from the response """ title = job.get("name") job_url = job.get("job_url") - description = BeautifulSoup( job.get("job_description", "").strip(), "html.parser" ).get_text() @@ -144,7 +134,7 @@ def process_job(self, job: dict) -> JobPost: location=location, job_type=job_type, compensation=Compensation( - interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") , + interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"), min_amount=int(job["compensation_min"]) if "compensation_min" in job else None, max_amount=int(job["compensation_max"]) if "compensation_max" in job else None, currency=job.get("compensation_currency"), @@ -192,107 +182,6 @@ def add_params(scraper_input) -> dict[str, str | Any]: return params - @staticmethod - def get_interval(interval_str: str): - """ - Maps the interval alias to its appropriate CompensationInterval. - :param interval_str - :return: CompensationInterval - """ - interval_alias = {"annually": CompensationInterval.YEARLY} - interval_str = interval_str.lower() - - if interval_str in interval_alias: - return interval_alias[interval_str] - - return CompensationInterval(interval_str) - - @staticmethod - def get_date_posted(job: Tag) -> Optional[datetime.date]: - """ - Extracts the date a job was posted - :param job - :return: date the job was posted or None - """ - button = job.find( - "button", {"class": "action_input save_job zrs_btn_secondary_200"} - ) - if not button: - return None - - url_time = button.get("data-href", "") - url_components = urlparse(url_time) - params = parse_qs(url_components.query) - posted_time_str = params.get("posted_time", [None])[0] - - if posted_time_str: - posted_date = datetime.strptime( - posted_time_str, "%Y-%m-%dT%H:%M:%SZ" - ).date() - return posted_date - - return None - - @staticmethod - def get_compensation(job: Tag) -> Optional[Compensation]: - """ - Parses the compensation tag from the job BeautifulSoup object - :param job - :return: Compensation object or None - """ - pay_element = job.find("li", {"class": "perk_item perk_pay"}) - if pay_element is None: - return None - pay = pay_element.find("div", {"class": "value"}).find("span").text.strip() - - def create_compensation_object(pay_string: str) -> Compensation: - """ - Creates a Compensation object from a pay_string - :param pay_string - :return: compensation - """ - interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1]) - - amounts = [] - for amount in pay_string.split("to"): - amount = amount.replace(",", "").strip("$ ").split(" ")[0] - if "K" in amount: - amount = amount.replace("K", "") - amount = int(float(amount)) * 1000 - else: - amount = int(float(amount)) - amounts.append(amount) - - compensation = Compensation( - interval=interval, - min_amount=min(amounts), - max_amount=max(amounts), - currency="USD/CAD", - ) - - return compensation - - return create_compensation_object(pay) - - @staticmethod - def get_location(job: Tag) -> Location: - """ - Extracts the job location from BeatifulSoup object - :param job: - :return: location - """ - location_link = job.find("a", {"class": "company_location"}) - if location_link is not None: - location_string = location_link.text.strip() - parts = location_string.split(", ") - if len(parts) == 2: - city, state = parts - else: - city, state = None, None - else: - city, state = None, None - return Location(city=city, state=state, country=Country.US_CANADA) - @staticmethod def headers() -> dict: """