Skip to content

Commit

Permalink
enh: proxies (#157)
Browse files Browse the repository at this point in the history
* enh: proxies

* enh: proxies
  • Loading branch information
cullenwatson authored May 25, 2024
1 parent cd29f79 commit 5cb7ffe
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 354 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ work with us.*

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxy support
- Proxy support (multiple proxies, rotated round-robin)

[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
Expand Down Expand Up @@ -39,7 +39,10 @@ jobs = scrape_jobs(
results_wanted=20,
hours_old=72, # (only LinkedIn/Indeed are hour-specific; others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor

# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
# proxies=["user:pass@host:port", "localhost"],

)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
Expand Down Expand Up @@ -76,8 +79,9 @@ Optional
├── job_type (str):
| fulltime, parttime, internship, contract
├── proxy (str):
| in format 'http://user:pass@host:port'
├── proxies (list[str] | str):
| in format ['user:pass@host:port', 'localhost']
| each job board will round-robin through the proxies
├── is_remote (bool)
Expand Down Expand Up @@ -201,7 +205,7 @@ You can specify the following countries when searching on Indeed (use the exact
## Notes
* Indeed is the best scraper currently with no rate limiting.
* All the job board endpoints are capped at around 1000 jobs on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page.
* LinkedIn is the most restrictive and usually rate limits around the 10th page when scraping from a single IP address. Proxies are essentially required.

## Frequently Asked Questions

Expand All @@ -216,7 +220,7 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:

- Waiting some time between scrapes (site-dependent).
- Trying a VPN or proxy to change your IP address.
- Wait some time between scrapes (site-dependent).
- Try using the `proxies` parameter to change your IP address.

---
30 changes: 0 additions & 30 deletions examples/JobSpy_AllSites.py

This file was deleted.

167 changes: 0 additions & 167 deletions examples/JobSpy_Demo.ipynb

This file was deleted.

78 changes: 0 additions & 78 deletions examples/JobSpy_LongScrape.py

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.53"
version = "1.1.54"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand Down
4 changes: 2 additions & 2 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def scrape_jobs(
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: str | None = None,
proxies: list[str] | str | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
Expand Down Expand Up @@ -96,7 +96,7 @@ def get_site_type():

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraper = scraper_class(proxies=proxies)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
Expand Down
4 changes: 2 additions & 2 deletions src/jobspy/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ class ScraperInput(BaseModel):


class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
def __init__(self, site: Site, proxies: list[str] | None = None):
self.proxies = proxies
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
7 changes: 3 additions & 4 deletions src/jobspy/scrapers/glassdoor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@


class GlassdoorScraper(Scraper):
def __init__(self, proxy: Optional[str] = None):
def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy)
super().__init__(site, proxies=proxies)

self.base_url = None
self.country = None
Expand All @@ -59,7 +59,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()

self.session = create_session(self.proxy, is_tls=True, has_retry=True)
self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token

Expand Down Expand Up @@ -245,7 +245,6 @@ def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True)
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:
Expand Down
Loading

0 comments on commit 5cb7ffe

Please sign in to comment.