From eed7fca300cc4ad5002d6f9e37dacf3e57946297 Mon Sep 17 00:00:00 2001 From: Vincent Yan Date: Mon, 27 Nov 2023 16:00:36 -0500 Subject: [PATCH] Get full indeed description (#70) --- pyproject.toml | 2 +- src/jobspy/jobs/__init__.py | 2 +- src/jobspy/scrapers/indeed/__init__.py | 19 ++----------------- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 08272d6..f71bb68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.28" +version = "1.1.29" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index f1fd708..ae68be4 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -121,7 +121,7 @@ class Country(Enum): # internal for ziprecruiter US_CANADA = ("usa/ca", "www") - # internal for linkeind + # internal for linkedin WORLDWIDE = ("worldwide", "www") @property diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 61c19f7..c758043 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -235,24 +235,9 @@ def get_description(self, job_page_url: str) -> str | None: if response.status_code not in range(200, 400): return None - soup = BeautifulSoup(response.text, "html.parser") - script_tag = soup.find( - "script", text=lambda x: x and "window._initialData" in x - ) - - if not script_tag: - return None - - script_code = script_tag.string - match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S) - - if not match: - return None - - json_string = match.group(1) - data = json.loads(json_string) try: - job_description = data["jobInfoWrapperModel"]["jobInfoModel"][ + data = json.loads(response.text) + job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][ "sanitizedJobDescription" ] except (KeyError, TypeError, IndexError):