readd proxy support for zip (#64)
cullenwatson authored Oct 29, 2023
1 parent b303b3f commit e3fc222
Showing 4 changed files with 32 additions and 136 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.21"
+version = "1.1.22"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
 homepage = "https://github.com/Bunsly/JobSpy"
2 changes: 1 addition & 1 deletion src/jobspy/scrapers/indeed/__init__.py
@@ -58,7 +58,6 @@ def scrape_page(
         self.country = scraper_input.country
         domain = self.country.domain_value
         self.url = f"https://{domain}.indeed.com"
-        session = create_session(self.proxy)
 
         params = {
             "q": scraper_input.search_term,
@@ -78,6 +77,7 @@ def scrape_page(
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
+            session = create_session(self.proxy, is_tls=True)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),
31 changes: 19 additions & 12 deletions src/jobspy/scrapers/utils.py
@@ -1,4 +1,6 @@
 import re
+
+import requests
 import tls_client
 from ..jobs import JobType
 
@@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None:
     return email_regex.findall(text)
 
 
-def create_session(proxy: str | None = None):
+def create_session(proxy: dict | None = None, is_tls: bool = True):
     """
     Creates a tls client session
     :return: A session object with or without proxies.
     """
-    session = tls_client.Session(
-        client_identifier="chrome112",
-        random_tls_extension_order=True,
-    )
-    session.proxies = proxy
-    # TODO multiple proxies
-    # if self.proxies:
-    #     session.proxies = {
-    #         "http": random.choice(self.proxies),
-    #         "https": random.choice(self.proxies),
-    #     }
+    if is_tls:
+        session = tls_client.Session(
+            client_identifier="chrome112",
+            random_tls_extension_order=True,
+        )
+        session.proxies = proxy
+        # TODO multiple proxies
+        # if self.proxies:
+        #     session.proxies = {
+        #         "http": random.choice(self.proxies),
+        #         "https": random.choice(self.proxies),
+        #     }
+    else:
+        session = requests.Session()
+        session.allow_redirects = True
+        session.proxies.update(proxy)
 
     return session

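For reference, a minimal usage sketch of the updated helper (a sketch only: the import path is assumed from the repo layout, and the proxy URL is a placeholder; per the signature change above, create_session now takes a requests-style proxy mapping rather than a string):

    from jobspy.scrapers.utils import create_session

    # Placeholder proxy mapping in the format requests/tls_client expect.
    proxy = {
        "http": "http://user:pass@proxy-host:8080",
        "https": "http://user:pass@proxy-host:8080",
    }

    tls_session = create_session(proxy, is_tls=True)     # tls_client session (Indeed)
    plain_session = create_session(proxy, is_tls=False)  # plain requests session (ZipRecruiter)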
133 changes: 11 additions & 122 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -9,25 +9,14 @@
 import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
-from urllib.parse import urlparse, parse_qs, urlunparse
-
-import requests
 from bs4 import BeautifulSoup
 from bs4.element import Tag
-from concurrent.futures import ThreadPoolExecutor, Future
+from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ..utils import count_urgent_words, extract_emails_from_text, create_session
-from ...jobs import (
-    JobPost,
-    Compensation,
-    CompensationInterval,
-    Location,
-    JobResponse,
-    JobType,
-    Country,
-)
+from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
 
 
 class ZipRecruiterScraper(Scraper):
@@ -42,21 +31,22 @@ def __init__(self, proxy: Optional[str] = None):
         self.jobs_per_page = 20
         self.seen_urls = set()
 
-    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
+    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
         :param continue_token:
         :return: jobs found on page
         """
         params = self.add_params(scraper_input)
         if continue_token:
             params['continue'] = continue_token
         try:
-            response = requests.get(
+            session = create_session(self.proxy, is_tls=False)
+            response = session.get(
                 f"https://api.ziprecruiter.com/jobs-app/jobs",
                 headers=self.headers(),
                 params=self.add_params(scraper_input),
                 allow_redirects=True,
                 timeout=10,
             )
             if response.status_code != 200:
@@ -73,7 +63,7 @@ def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optiona
         jobs_list = response_data.get("jobs", [])
         next_continue_token = response_data.get('continue', None)
 
-        with ThreadPoolExecutor(max_workers=10) as executor:
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
             job_results = [
                 executor.submit(self.process_job, job)
                 for job in jobs_list
@@ -109,12 +99,12 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:

         return JobResponse(jobs=job_list)
 
-    def process_job(self, job: dict) -> JobPost:
-        """the most common type of jobs page on ZR"""
+    @staticmethod
+    def process_job(job: dict) -> JobPost:
+        """ Processes an individual job dict from the response """
         title = job.get("name")
         job_url = job.get("job_url")
 
-
         description = BeautifulSoup(
             job.get("job_description", "").strip(), "html.parser"
         ).get_text()
@@ -144,7 +134,7 @@ def process_job(self, job: dict) -> JobPost:
             location=location,
             job_type=job_type,
             compensation=Compensation(
-                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") ,
+                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
                 min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
                 max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
                 currency=job.get("compensation_currency"),
@@ -192,107 +182,6 @@ def add_params(scraper_input) -> dict[str, str | Any]:

         return params
 
-    @staticmethod
-    def get_interval(interval_str: str):
-        """
-        Maps the interval alias to its appropriate CompensationInterval.
-        :param interval_str
-        :return: CompensationInterval
-        """
-        interval_alias = {"annually": CompensationInterval.YEARLY}
-        interval_str = interval_str.lower()
-
-        if interval_str in interval_alias:
-            return interval_alias[interval_str]
-
-        return CompensationInterval(interval_str)
-
-    @staticmethod
-    def get_date_posted(job: Tag) -> Optional[datetime.date]:
-        """
-        Extracts the date a job was posted
-        :param job
-        :return: date the job was posted or None
-        """
-        button = job.find(
-            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
-        )
-        if not button:
-            return None
-
-        url_time = button.get("data-href", "")
-        url_components = urlparse(url_time)
-        params = parse_qs(url_components.query)
-        posted_time_str = params.get("posted_time", [None])[0]
-
-        if posted_time_str:
-            posted_date = datetime.strptime(
-                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
-            ).date()
-            return posted_date
-
-        return None
-
-    @staticmethod
-    def get_compensation(job: Tag) -> Optional[Compensation]:
-        """
-        Parses the compensation tag from the job BeautifulSoup object
-        :param job
-        :return: Compensation object or None
-        """
-        pay_element = job.find("li", {"class": "perk_item perk_pay"})
-        if pay_element is None:
-            return None
-        pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
-
-        def create_compensation_object(pay_string: str) -> Compensation:
-            """
-            Creates a Compensation object from a pay_string
-            :param pay_string
-            :return: compensation
-            """
-            interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
-
-            amounts = []
-            for amount in pay_string.split("to"):
-                amount = amount.replace(",", "").strip("$ ").split(" ")[0]
-                if "K" in amount:
-                    amount = amount.replace("K", "")
-                    amount = int(float(amount)) * 1000
-                else:
-                    amount = int(float(amount))
-                amounts.append(amount)
-
-            compensation = Compensation(
-                interval=interval,
-                min_amount=min(amounts),
-                max_amount=max(amounts),
-                currency="USD/CAD",
-            )
-
-            return compensation
-
-        return create_compensation_object(pay)
-
-    @staticmethod
-    def get_location(job: Tag) -> Location:
-        """
-        Extracts the job location from BeatifulSoup object
-        :param job:
-        :return: location
-        """
-        location_link = job.find("a", {"class": "company_location"})
-        if location_link is not None:
-            location_string = location_link.text.strip()
-            parts = location_string.split(", ")
-            if len(parts) == 2:
-                city, state = parts
-            else:
-                city, state = None, None
-        else:
-            city, state = None, None
-        return Location(city=city, state=state, country=Country.US_CANADA)
-
     @staticmethod
     def headers() -> dict:
         """
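For context, a sketch of how the continue-token pagination from find_jobs_in_page above can be driven (illustrative only: the scraper and scraper_input objects and the page cap are assumed; the method name and its (jobs, token) return shape come from the diff):

    # Hypothetical driver loop; `scraper` and `scraper_input` are assumed to exist.
    job_list = []
    continue_token = None
    for _ in range(3):  # hypothetical page cap
        jobs, continue_token = scraper.find_jobs_in_page(scraper_input, continue_token)
        job_list.extend(jobs)
        if not continue_token:
            break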