fix: job types
cullenwatson committed Feb 12, 2024
1 parent 91b137e commit babbc09
Showing 7 changed files with 215 additions and 100 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.43"
+version = "1.1.44"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
 homepage = "https://github.com/Bunsly/JobSpy"
src/jobspy/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -192,4 +192,4 @@ def worker(site):
     else:
         jobs_formatted_df = pd.DataFrame()
 
-    return jobs_formatted_df.sort_values(by='date_posted', ascending=False)
+    return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
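
Note: the aggregated DataFrame is now sorted on two keys, so each board's postings stay grouped together and are newest-first within the group. A minimal pandas sketch of the same multi-key sort (hypothetical frame, not JobSpy's full schema):

    import pandas as pd

    df = pd.DataFrame({
        "site": ["linkedin", "indeed", "indeed"],
        "date_posted": pd.to_datetime(["2024-02-11", "2024-02-10", "2024-02-12"]),
    })
    # ascending=[True, False]: 'site' A-Z, then newest 'date_posted' within each site
    print(df.sort_values(by=["site", "date_posted"], ascending=[True, False]))
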
src/jobspy/scrapers/glassdoor/__init__.py (195 changes: 174 additions & 21 deletions)

Large diffs are not rendered by default.

src/jobspy/scrapers/indeed/__init__.py (60 changes: 9 additions & 51 deletions)
@@ -11,7 +11,6 @@
 from typing import Any
 from datetime import datetime
 
-import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,7 @@
     extract_emails_from_text,
     create_session,
     get_enum_from_job_type,
-    modify_and_get_description
+    logger
 )
 from ...jobs import (
     JobPost,
@@ -57,6 +56,8 @@ def scrape_page(
         :param page:
         :return: jobs found on page, total number of jobs found for search
         """
+        job_list = []
+        total_num_jobs = 0
         self.country = scraper_input.country
         domain = self.country.indeed_domain_value
         self.url = f"https://{domain}.indeed.com"
@@ -76,11 +77,12 @@
             )
         except Exception as e:
             if "Proxy responded with" in str(e):
-                raise IndeedException("bad proxy")
-            raise IndeedException(str(e))
+                logger.error(f'Indeed: Bad proxy')
+            else:
+                logger.error(f'Indeed: {str(e)}')
+            return job_list, total_num_jobs
 
         soup = BeautifulSoup(response.content, "html.parser")
-        job_list = []
         total_num_jobs = IndeedScraper.total_jobs(soup)
         if "did not match any jobs" in response.text:
             return job_list, total_num_jobs
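
The effect of this hunk: a failed request now logs through the shared JobSpy logger and returns the pre-initialized empty (job_list, total_num_jobs) pair instead of raising IndeedException, so one bad page or proxy no longer aborts the whole scrape. A standalone sketch of the pattern (fetch is an illustrative stand-in, not the scraper's API):

    import logging

    logger = logging.getLogger("JobSpy")

    def scrape_page(fetch) -> tuple[list, int]:
        job_list: list = []   # safe defaults, returned on failure
        total_num_jobs = 0
        try:
            response = fetch()
        except Exception as e:
            logger.error(f"Indeed: {e}")
            return job_list, total_num_jobs
        # ... parse response into job_list / total_num_jobs ...
        return job_list, total_num_jobs
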
@@ -188,50 +190,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         )
         return job_response
 
-    def get_description(self, job_page_url: str) -> str | None:
-        """
-        Retrieves job description by going to the job page url
-        :param job_page_url:
-        :return: description
-        """
-        parsed_url = urllib.parse.urlparse(job_page_url)
-        params = urllib.parse.parse_qs(parsed_url.query)
-        jk_value = params.get("jk", [None])[0]
-        formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
-        session = create_session(self.proxy)
-
-        try:
-            response = session.get(
-                formatted_url,
-                headers=self.get_headers(),
-                allow_redirects=True,
-                timeout_seconds=5,
-            )
-        except Exception as e:
-            return None
-
-        if response.status_code not in range(200, 400):
-            return None
-
-        try:
-            soup = BeautifulSoup(response.text, 'html.parser')
-            script_tags = soup.find_all('script')
-
-            job_description = ''
-            for tag in script_tags:
-                if 'window._initialData' in tag.text:
-                    json_str = tag.text
-                    json_str = json_str.split('window._initialData=')[1]
-                    json_str = json_str.rsplit(';', 1)[0]
-                    data = json.loads(json_str)
-                    job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
-                    break
-        except (KeyError, TypeError, IndexError):
-            return None
-
-        soup = BeautifulSoup(job_description, "html.parser")
-        return modify_and_get_description(soup)
-
     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:
         """
@@ -380,7 +338,7 @@ def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         if scraper_input.is_remote:
             sc_values.append("attr(DSQF7)")
         if scraper_input.job_type:
-            sc_values.append("jt({})".format(scraper_input.job_type.value))
+            sc_values.append("jt({})".format(scraper_input.job_type.value[0]))
 
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
@@ -406,7 +364,7 @@ def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
             taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
             for taxonomy in job.get("taxonomyAttributes", [])
         )
-        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
 
     def get_job_details(self, job_keys: list[str]) -> dict:
         """
src/jobspy/scrapers/linkedin/__init__.py (14 changes: 9 additions & 5 deletions)
@@ -31,8 +31,7 @@
     count_urgent_words,
     extract_emails_from_text,
     get_enum_from_job_type,
-    currency_parser,
-    modify_and_get_description
+    currency_parser
 )
 
 
@@ -236,10 +235,15 @@ def get_job_description(
         div_content = soup.find(
             "div", class_=lambda x: x and "show-more-less-html__markup" in x
         )
-
         description = None
-        if div_content:
-            description = modify_and_get_description(div_content)
+        if div_content is not None:
+            def remove_attributes(tag):
+                for attr in list(tag.attrs):
+                    del tag[attr]
+                return tag
+
+            div_content = remove_attributes(div_content)
+            description = div_content.prettify(formatter="html")
 
     def get_job_type(
         soup_job_type: BeautifulSoup,
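
LinkedIn descriptions are no longer flattened to plain text by the deleted modify_and_get_description helper; the matched div now keeps its HTML, with the container's attributes stripped before re-serializing via prettify. A small bs4 illustration with a made-up snippet (note the inline remove_attributes strips only the outer tag's attributes, not its children's):

    from bs4 import BeautifulSoup

    html = '<div class="show-more-less-html__markup" dir="ltr"><p>Senior role...</p></div>'
    div = BeautifulSoup(html, "html.parser").div
    for attr in list(div.attrs):   # drop class/dir from the container
        del div[attr]
    print(div.prettify(formatter="html"))  # bare <div> wrapping the original markup
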
src/jobspy/scrapers/utils.py (20 changes: 12 additions & 8 deletions)
@@ -1,4 +1,5 @@
 import re
+import logging
 import numpy as np
 
 import tls_client
@@ -7,14 +8,14 @@
 
 from ..jobs import JobType
 
-
-def modify_and_get_description(soup):
-    for li in soup.find_all('li'):
-        li.string = "- " + li.get_text()
-
-    description = soup.get_text(separator='\n').strip()
-    description = re.sub(r'\n+', '\n', description)
-    return description
+logger = logging.getLogger("JobSpy")
+if not logger.handlers:
+    logger.setLevel(logging.ERROR)
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.ERROR)
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
 
 
 def count_urgent_words(description: str) -> int:
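
utils.py now exposes a package-wide logger in place of the deleted modify_and_get_description helper; the "if not logger.handlers" guard keeps repeated imports from attaching duplicate StreamHandlers. Callers, as in the Indeed change above, just do:

    from ..utils import logger

    logger.error("Indeed: Bad proxy")
    # 2024-02-12 ... - JobSpy - ERROR - Indeed: Bad proxy
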
@@ -79,6 +80,7 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
             res = job_type
     return res
 
+
 def currency_parser(cur_str):
     # Remove any non-numerical characters
     # except for ',' '.' or '-' (e.g. EUR)
@@ -94,3 +96,5 @@ def currency_parser(cur_str):
     num = float(cur_str)
 
     return np.round(num, 2)
+
+
src/jobspy/scrapers/ziprecruiter/__init__.py (22 changes: 9 additions & 13 deletions)
@@ -9,13 +9,12 @@
 from datetime import datetime, timezone
 from typing import Optional, Tuple, Any
 
-from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
-from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
+from ..utils import count_urgent_words, extract_emails_from_text, create_session
 
 
 class ZipRecruiterScraper(Scraper):
@@ -107,9 +106,7 @@ def process_job(self, job: dict) -> JobPost | None:
             return
         self.seen_urls.add(job_url)
 
-        job_description_html = job.get("job_description", "").strip()
-        description_soup = BeautifulSoup(job_description_html, "html.parser")
-        description = modify_and_get_description(description_soup)
+        description = job.get("job_description", "").strip()
 
         company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
Expand Down Expand Up @@ -169,23 +166,22 @@ def add_params(scraper_input) -> dict[str, str | Any]:
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params['days'] = fromage
job_type_value = None
job_type_map = {
JobType.FULL_TIME: 'full_time',
JobType.PART_TIME: 'part_time'
}
if scraper_input.job_type:
if scraper_input.job_type.value == "fulltime":
job_type_value = "full_time"
elif scraper_input.job_type.value == "parttime":
job_type_value = "part_time"
else:
job_type_value = scraper_input.job_type.value
params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
if scraper_input.easy_apply:
params['zipapply'] = 1

if job_type_value:
params[
"refine_by_employment"
"empl"
] = f"employment_type:employment_type:{job_type_value}"

if scraper_input.is_remote:
params["refine_by_location_type"] = "only_remote"
params["remote"] = 1

if scraper_input.distance:
params["radius"] = scraper_input.distance
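
The ZipRecruiter branch now resolves employment_type through a dict lookup: FULL_TIME and PART_TIME map to ZipRecruiter's snake_case values, and any other JobType falls back to its canonical alias via value[0], mirroring the Indeed fix above. An equivalent, slightly tidier form using dict.get (a sketch, not the committed code):

    job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
    if scraper_input.job_type:
        params["employment_type"] = job_type_map.get(
            scraper_input.job_type, scraper_input.job_type.value[0]
        )
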
