fix: job types
cullenwatson committed Feb 12, 2024
1 parent 91b137e commit babbc09
Showing 7 changed files with 215 additions and 100 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.43"
+version = "1.1.44"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
 homepage = "https://github.com/Bunsly/JobSpy"
src/jobspy/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -192,4 +192,4 @@ def worker(site):
     else:
         jobs_formatted_df = pd.DataFrame()
 
-    return jobs_formatted_df.sort_values(by='date_posted', ascending=False)
+    return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
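
Note: the aggregated DataFrame is now sorted on two keys, so each board's postings stay grouped together and are newest-first within the group. A minimal pandas sketch of the same multi-key sort (hypothetical frame, not JobSpy's full schema):

    import pandas as pd

    df = pd.DataFrame({
        "site": ["linkedin", "indeed", "indeed"],
        "date_posted": pd.to_datetime(["2024-02-11", "2024-02-10", "2024-02-12"]),
    })
    # ascending=[True, False]: 'site' A-Z, then newest 'date_posted' within each site
    print(df.sort_values(by=["site", "date_posted"], ascending=[True, False]))
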
src/jobspy/scrapers/glassdoor/__init__.py (195 changes: 174 additions & 21 deletions)

Large diffs are not rendered by default.

src/jobspy/scrapers/indeed/__init__.py (60 changes: 9 additions & 51 deletions)
@@ -11,7 +11,6 @@
 from typing import Any
 from datetime import datetime
 
-import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,7 @@
     extract_emails_from_text,
     create_session,
     get_enum_from_job_type,
-    modify_and_get_description
+    logger
 )
 from ...jobs import (
     JobPost,
@@ -57,6 +56,8 @@ def scrape_page(
         :param page:
         :return: jobs found on page, total number of jobs found for search
         """
+        job_list = []
+        total_num_jobs = 0
         self.country = scraper_input.country
         domain = self.country.indeed_domain_value
         self.url = f"https://{domain}.indeed.com"
@@ -76,11 +77,12 @@
             )
         except Exception as e:
             if "Proxy responded with" in str(e):
-                raise IndeedException("bad proxy")
-            raise IndeedException(str(e))
+                logger.error(f'Indeed: Bad proxy')
+            else:
+                logger.error(f'Indeed: {str(e)}')
+            return job_list, total_num_jobs
 
         soup = BeautifulSoup(response.content, "html.parser")
-        job_list = []
         total_num_jobs = IndeedScraper.total_jobs(soup)
         if "did not match any jobs" in response.text:
             return job_list, total_num_jobs
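
The effect of this hunk: a failed request now logs through the shared JobSpy logger and returns the pre-initialized empty (job_list, total_num_jobs) pair instead of raising IndeedException, so one bad page or proxy no longer aborts the whole scrape. A standalone sketch of the pattern (fetch is an illustrative stand-in, not the scraper's API):

    import logging

    logger = logging.getLogger("JobSpy")

    def scrape_page(fetch) -> tuple[list, int]:
        job_list: list = []   # safe defaults, returned on failure
        total_num_jobs = 0
        try:
            response = fetch()
        except Exception as e:
            logger.error(f"Indeed: {e}")
            return job_list, total_num_jobs
        # ... parse response into job_list / total_num_jobs ...
        return job_list, total_num_jobs
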
@@ -188,50 +190,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         )
         return job_response
 
-    def get_description(self, job_page_url: str) -> str | None:
-        """
-        Retrieves job description by going to the job page url
-        :param job_page_url:
-        :return: description
-        """
-        parsed_url = urllib.parse.urlparse(job_page_url)
-        params = urllib.parse.parse_qs(parsed_url.query)
-        jk_value = params.get("jk", [None])[0]
-        formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
-        session = create_session(self.proxy)
-
-        try:
-            response = session.get(
-                formatted_url,
-                headers=self.get_headers(),
-                allow_redirects=True,
-                timeout_seconds=5,
-            )
-        except Exception as e:
-            return None
-
-        if response.status_code not in range(200, 400):
-            return None
-
-        try:
-            soup = BeautifulSoup(response.text, 'html.parser')
-            script_tags = soup.find_all('script')
-
-            job_description = ''
-            for tag in script_tags:
-                if 'window._initialData' in tag.text:
-                    json_str = tag.text
-                    json_str = json_str.split('window._initialData=')[1]
-                    json_str = json_str.rsplit(';', 1)[0]
-                    data = json.loads(json_str)
-                    job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
-                    break
-        except (KeyError, TypeError, IndexError):
-            return None
-
-        soup = BeautifulSoup(job_description, "html.parser")
-        return modify_and_get_description(soup)
-
     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:
         """
@@ -380,7 +338,7 @@ def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         if scraper_input.is_remote:
             sc_values.append("attr(DSQF7)")
         if scraper_input.job_type:
-            sc_values.append("jt({})".format(scraper_input.job_type.value))
+            sc_values.append("jt({})".format(scraper_input.job_type.value[0]))
 
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
@@ -406,7 +364,7 @@ def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
             taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
             for taxonomy in job.get("taxonomyAttributes", [])
         )
-        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
 
     def get_job_details(self, job_keys: list[str]) -> dict:
         """
src/jobspy/scrapers/linkedin/__init__.py (14 changes: 9 additions & 5 deletions)
@@ -31,8 +31,7 @@
     count_urgent_words,
     extract_emails_from_text,
     get_enum_from_job_type,
-    currency_parser,
-    modify_and_get_description
+    currency_parser
 )
 
 
@@ -236,10 +235,15 @@ def get_job_description(
         div_content = soup.find(
             "div", class_=lambda x: x and "show-more-less-html__markup" in x
         )
-
         description = None
-        if div_content:
-            description = modify_and_get_description(div_content)
+        if div_content is not None:
+            def remove_attributes(tag):
+                for attr in list(tag.attrs):
+                    del tag[attr]
+                return tag
+
+            div_content = remove_attributes(div_content)
+            description = div_content.prettify(formatter="html")
 
     def get_job_type(
         soup_job_type: BeautifulSoup,
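
LinkedIn descriptions are no longer flattened to plain text by the deleted modify_and_get_description helper; the matched div now keeps its HTML, with the container's attributes stripped before re-serializing via prettify. A small bs4 illustration with a made-up snippet (note the inline remove_attributes strips only the outer tag's attributes, not its children's):

    from bs4 import BeautifulSoup

    html = '<div class="show-more-less-html__markup" dir="ltr"><p>Senior role...</p></div>'
    div = BeautifulSoup(html, "html.parser").div
    for attr in list(div.attrs):   # drop class/dir from the container
        del div[attr]
    print(div.prettify(formatter="html"))  # bare <div> wrapping the original markup
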
src/jobspy/scrapers/utils.py (20 changes: 12 additions & 8 deletions)
@@ -1,4 +1,5 @@
 import re
+import logging
 import numpy as np
 
 import tls_client
@@ -7,14 +8,14 @@
 
 from ..jobs import JobType
 
-
-def modify_and_get_description(soup):
-    for li in soup.find_all('li'):
-        li.string = "- " + li.get_text()
-
-    description = soup.get_text(separator='\n').strip()
-    description = re.sub(r'\n+', '\n', description)
-    return description
+logger = logging.getLogger("JobSpy")
+if not logger.handlers:
+    logger.setLevel(logging.ERROR)
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.ERROR)
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
 
 
 def count_urgent_words(description: str) -> int:
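
utils.py now exposes a package-wide logger in place of the deleted modify_and_get_description helper; the "if not logger.handlers" guard keeps repeated imports from attaching duplicate StreamHandlers. Callers, as in the Indeed change above, just do:

    from ..utils import logger

    logger.error("Indeed: Bad proxy")
    # 2024-02-12 ... - JobSpy - ERROR - Indeed: Bad proxy
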
@@ -79,6 +80,7 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
             res = job_type
     return res
 
+
 def currency_parser(cur_str):
     # Remove any non-numerical characters
     # except for ',' '.' or '-' (e.g. EUR)
@@ -94,3 +96,5 @@ def currency_parser(cur_str):
     num = float(cur_str)
 
     return np.round(num, 2)
+
+
src/jobspy/scrapers/ziprecruiter/__init__.py (22 changes: 9 additions & 13 deletions)
@@ -9,13 +9,12 @@
 from datetime import datetime, timezone
 from typing import Optional, Tuple, Any
 
-from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
-from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
+from ..utils import count_urgent_words, extract_emails_from_text, create_session
 
 
 class ZipRecruiterScraper(Scraper):
@@ -107,9 +106,7 @@ def process_job(self, job: dict) -> JobPost | None:
             return
         self.seen_urls.add(job_url)
 
-        job_description_html = job.get("job_description", "").strip()
-        description_soup = BeautifulSoup(job_description_html, "html.parser")
-        description = modify_and_get_description(description_soup)
+        description = job.get("job_description", "").strip()
 
         company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
Expand Down Expand Up @@ -169,23 +166,22 @@ def add_params(scraper_input) -> dict[str, str | Any]:
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params['days'] = fromage
job_type_value = None
job_type_map = {
JobType.FULL_TIME: 'full_time',
JobType.PART_TIME: 'part_time'
}
if scraper_input.job_type:
if scraper_input.job_type.value == "fulltime":
job_type_value = "full_time"
elif scraper_input.job_type.value == "parttime":
job_type_value = "part_time"
else:
job_type_value = scraper_input.job_type.value
params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
if scraper_input.easy_apply:
params['zipapply'] = 1

if job_type_value:
params[
"refine_by_employment"
"empl"
] = f"employment_type:employment_type:{job_type_value}"

if scraper_input.is_remote:
params["refine_by_location_type"] = "only_remote"
params["remote"] = 1

if scraper_input.distance:
params["radius"] = scraper_input.distance
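
The ZipRecruiter branch now resolves employment_type through a dict lookup: FULL_TIME and PART_TIME map to ZipRecruiter's snake_case values, and any other JobType falls back to its canonical alias via value[0], mirroring the Indeed fix above. An equivalent, slightly tidier form using dict.get (a sketch, not the committed code):

    job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
    if scraper_input.job_type:
        params["employment_type"] = job_type_map.get(
            scraper_input.job_type, scraper_input.job_type.value[0]
        )
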
