diff --git a/LICENSE b/LICENSE index 515ec81..8f095ef 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Cullen Watson +Copyright (c) 2024 Cullen Watson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 23a2008..ebcbbcb 100644 --- a/README.md +++ b/README.md @@ -21,33 +21,35 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) ### Usage ```python -from staffspy import scrape_staff, SolverType +from staffspy import LinkedInAccount, SolverType from pathlib import Path + session_file = Path(__file__).resolve().parent / "session.pkl" +account = LinkedInAccount( + ## credentials - remove these to sign in with browser + username="myemail@gmail.com", + password="mypassword", + solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha + solver_service=SolverType.CAPSOLVER, + + session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) + log_level=1, # 0 for no logs +) -staff = scrape_staff( - ## staff filters +# search by company +staff = account.scrape_staff( company_name="openai", search_term="software engineer", location="london", extra_profile_data=True, # fetch all past experiences, schools, & skills - ## - - ## config max_results=50, # can go up to 1000 - session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) - log_level=1, # 0 for no logs - ## - - ## credentials - remove these to sign in with browser - username="myemail@gmail.com", - password="mypassword", - solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha - solver_service=SolverType.CAPSOLVER - ## ) -filename = "staff.csv" -staff.to_csv(filename, index=False) +# or fetch by user ids +users = account.scrape_users( + user_ids=['williamhgates', 
'rbranson', 'jeffweiner08'] +) +staff.to_csv("staff.csv", index=False) +users.to_csv("users.csv", index=False) ``` #### Browser login diff --git a/pyproject.toml b/pyproject.toml index 770e7a6..a5712ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.5" +version = "0.2.6" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 370e4d7..06f31f0 100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -1,75 +1,115 @@ import pandas as pd from staffspy.linkedin.linkedin import LinkedInScraper +from staffspy.utils.models import Staff from staffspy.solvers.capsolver import CapSolver from staffspy.solvers.solver_type import SolverType from staffspy.solvers.two_captcha import TwoCaptchaSolver +from staffspy.utils.utils import set_logger_level, logger, Login -from staffspy.utils import set_logger_level, logger, Login +class LinkedInAccount: + solver_map = { + SolverType.CAPSOLVER: CapSolver, + SolverType.TWO_CAPTCHA: TwoCaptchaSolver + } -def scrape_staff( - *, - company_name: str = None, - user_id: str = None, - session_file: str = None, - search_term: str = None, - location: str = None, - extra_profile_data: bool = False, - max_results: int = 1000, - log_level: int = 0, - username: str = None, - password: str = None, - solver_api_key: str = None, - solver_service: SolverType = SolverType.CAPSOLVER + def __init__( + self, + session_file: str = None, + username: str = None, + password: str = None, + log_level: int = 0, + solver_api_key: str = None, + solver_service: SolverType = SolverType.CAPSOLVER + ): + self.session_file = session_file + self.username = username + self.password = password + self.log_level = log_level + self.solver = self.solver_map[solver_service](solver_api_key) + self.session = None + self.linkedin_scraper = None + self.login() -) -> pd.DataFrame: - """Scrape staff from 
Linkedin - company_name - name of company to find staff frame - user_id - alternative to company_name, fetches the company_name from the user profile - session_file - place to save cookies to only sign in once - search_term - occupation / term to search for at the company - location - filter for staff at a location - extra_profile_data - fetches staff's experiences, schools, and mor - max_results - amount of results you desire - log_level - level of logs, 0 for no logs, 2 for all - usernme,password - for requests based sign in - solver_api_key,solver_service - options to bypass captcha - """ - set_logger_level(log_level) + def login(self): + set_logger_level(self.log_level) + login = Login(self.username, self.password, self.solver, self.session_file) + self.session = login.load_session() - solver=None - if solver_service == SolverType.CAPSOLVER: - solver = CapSolver(solver_api_key) - elif solver_service == SolverType.TWO_CAPTCHA: - solver = TwoCaptchaSolver(solver_api_key) - login = Login(username, password, solver, session_file) - session = login.load_session() + def scrape_staff( + self, + company_name: str = None, + user_id: str = None, + search_term: str = None, + location: str = None, + extra_profile_data: bool = False, + max_results: int = 1000 + ) -> pd.DataFrame: + """Scrape staff from Linkedin + company_name - name of company to find staff from + user_id - alternative to company_name, fetches the company_name from the user profile + search_term - occupation / term to search for at the company + location - filter for staff at a location + extra_profile_data - fetches staff's experiences, schools, and more + max_results - amount of results you desire + """ + li_scraper = LinkedInScraper(self.session) - li = LinkedInScraper(session) + if not company_name: + if not user_id: + raise ValueError("Either company_name or user_id must be provided") + company_name = li_scraper.fetch_user_profile_data_from_public_id(user_id, 'company_id') - if not company_name: - if not
user_id: - raise ValueError("Either company_name or user_id must be provided") + staff = li_scraper.scrape_staff( + company_name=company_name, + extra_profile_data=extra_profile_data, + search_term=search_term, + location=location, + max_results=max_results, + ) + staff_dicts = [staff.to_dict() for staff in staff] + staff_df = pd.DataFrame(staff_dicts) - company_name = li.fetch_company_id_from_user(user_id) + if staff_df.empty: + return staff_df + linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] + non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] + staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) + logger.info(f"Scraped {len(staff_df)} staff members from {company_name}") + return staff_df - staff = li.scrape_staff( - company_name=company_name, - extra_profile_data=extra_profile_data, - search_term=search_term, - location=location, - max_results=max_results, - ) - staff_dicts = [staff.to_dict() for staff in staff] - staff_df = pd.DataFrame(staff_dicts) + def scrape_users( + self, + user_ids: list[str] + ) -> pd.DataFrame: + """Scrape users from Linkedin by user IDs + user_ids - list of LinkedIn user IDs + """ + li_scraper = LinkedInScraper(self.session) + li_scraper.num_staff = len(user_ids) + users = [ + Staff( + id='', + search_term='manual', + profile_id=user_id, + ) for user_id in user_ids + ] - if staff_df.empty: - return staff_df - linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] - non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] - staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) - logger.info( - f"Scraped {len(staff_df)} staff members, with {len(linkedin_member_df)} hidden LinkedIn Members." 
- ) - return staff_df + for i, user in enumerate(users,start=1): + user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id') + if user.id: + li_scraper.fetch_all_info_for_employee( + user, i + ) + + users_dicts = [user.to_dict() for user in users if user.id] + users_df = pd.DataFrame(users_dicts) + + if users_df.empty: + return users_df + linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"] + non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"] + users_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) + logger.info(f"Scraped {len(users_df)} users") + return users_df diff --git a/staffspy/linkedin/certifications.py b/staffspy/linkedin/certifications.py index bb05d5a..f19e490 100644 --- a/staffspy/linkedin/certifications.py +++ b/staffspy/linkedin/certifications.py @@ -1,8 +1,8 @@ import json import logging -from staffspy.exceptions import TooManyRequests -from staffspy.models import Certification +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import Certification logger = logging.getLogger(__name__) diff --git a/staffspy/linkedin/employee.py b/staffspy/linkedin/employee.py index 91b0a09..c80a04e 100644 --- a/staffspy/linkedin/employee.py +++ b/staffspy/linkedin/employee.py @@ -2,8 +2,9 @@ import logging import re -import staffspy.utils as utils -from staffspy.exceptions import TooManyRequests +import staffspy.utils.utils as utils +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import Staff logger = logging.getLogger(__name__) @@ -40,7 +41,7 @@ def fetch_employee(self, base_staff, domain): self.parse_emp(base_staff, employee_json) return True - def parse_emp(self, emp, emp_dict): + def parse_emp(self, emp: Staff, emp_dict: dict): """Parse the employee data from the employee profile.""" try: photo_data = emp_dict["profilePicture"]["displayImageReference"][ @@ -53,7 +54,14 @@ def parse_emp(self, emp, emp_dict): 
profile_photo = None emp.profile_id = emp_dict["publicIdentifier"] + try: + emp.headline = emp_dict.get('headline') + if not emp.headline: + emp.headline = emp_dict['memberRelationship']['memberRelationshipData']['noInvitation']['targetInviteeResolutionResult']['headline'] + except: + pass emp.is_connection = next(iter(emp_dict['memberRelationship']['memberRelationshipUnion'])) == 'connection' + emp.open_to_work = emp_dict['profilePicture'].get('frameType')=='OPEN_TO_WORK' emp.profile_link = f'https://www.linkedin.com/in/{emp_dict["publicIdentifier"]}' diff --git a/staffspy/linkedin/employee_bio.py b/staffspy/linkedin/employee_bio.py index 386bc2d..585bf97 100644 --- a/staffspy/linkedin/employee_bio.py +++ b/staffspy/linkedin/employee_bio.py @@ -1,7 +1,7 @@ import json import logging -from staffspy.exceptions import TooManyRequests +from staffspy.utils.exceptions import TooManyRequests logger = logging.getLogger(__name__) diff --git a/staffspy/linkedin/experiences.py b/staffspy/linkedin/experiences.py index bfd1c7f..b632e8a 100644 --- a/staffspy/linkedin/experiences.py +++ b/staffspy/linkedin/experiences.py @@ -1,9 +1,9 @@ import json import logging -import staffspy.utils as utils -from staffspy.exceptions import TooManyRequests -from staffspy.models import Experience +import staffspy.utils.utils as utils +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import Experience logger = logging.getLogger(__name__) diff --git a/staffspy/linkedin/linkedin.py b/staffspy/linkedin/linkedin.py index 55cba31..c5249b3 100644 --- a/staffspy/linkedin/linkedin.py +++ b/staffspy/linkedin/linkedin.py @@ -1,5 +1,5 @@ """ -staffspy.linkedin +staffspy.linkedin.linkedin ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape LinkedIn. 
@@ -12,16 +12,16 @@ import requests -import staffspy.utils as utils -from staffspy.exceptions import TooManyRequests, BadCookies, GeoUrnNotFound +import staffspy.utils.utils as utils +from staffspy.utils.exceptions import TooManyRequests, BadCookies, GeoUrnNotFound from staffspy.linkedin.certifications import CertificationFetcher from staffspy.linkedin.employee import EmployeeFetcher from staffspy.linkedin.employee_bio import EmployeeBioFetcher from staffspy.linkedin.experiences import ExperiencesFetcher from staffspy.linkedin.schools import SchoolsFetcher from staffspy.linkedin.skills import SkillsFetcher -from staffspy.models import Staff -from staffspy.utils import logger +from staffspy.utils.models import Staff +from staffspy.utils.utils import logger class LinkedInScraper: @@ -29,7 +29,7 @@ class LinkedInScraper: company_id_ep = "https://www.linkedin.com/voyager/api/organization/companies?q=universalName&universalName=" company_search_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashClusters.02af3bc8bc85a169bb76bb4805d05759&queryName=SearchClusterCollection&variables=(query:(flagshipSearchIntent:SEARCH_SRP,keywords:{company},includeFiltersInResponse:false,queryParameters:(keywords:List({company}),resultType:List(COMPANIES))),count:10,origin:GLOBAL_SEARCH_HEADER,start:0)" location_id_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashReusableTypeahead.57a4fa1dd92d3266ed968fdbab2d7bf5&queryName=SearchReusableTypeaheadByType&variables=(query:(showFullLastNameForConnections:false,typeaheadFilterQuery:(geoSearchTypes:List(MARKET_AREA,COUNTRY_REGION,ADMIN_DIVISION_1,CITY))),keywords:{location},type:GEO,start:0)" - get_company_from_user_ep = "https://www.linkedin.com/voyager/api/identity/profiles/{user_id}/profileView" + public_user_id_ep = "https://www.linkedin.com/voyager/api/identity/profiles/{user_id}/profileView" def __init__(self, session: requests.Session): self.session = session @@ -138,7 +138,7 @@ def 
parse_staff(self, elements): linkedin_id = match.group(1) name = person["title"]["text"].strip() - position = ( + headline = ( person.get("primarySubtitle", {}).get("text", "") if person.get("primarySubtitle") else "" @@ -147,7 +147,7 @@ def parse_staff(self, elements): Staff( id=linkedin_id, name=name, - position=position, + headline=headline, search_term=" - ".join( filter( None, @@ -272,7 +272,7 @@ def scrape_staff( try: for i, employee in enumerate(non_restricted, start=1): self.fetch_all_info_for_employee(employee, i) - except (BadCookies, TooManyRequests) as e: + except TooManyRequests as e: logger.error(str(e)) return reduced_staff_list @@ -283,38 +283,42 @@ def fetch_all_info_for_employee(self, employee: Staff, index: int): f"Fetching employee data for {employee.id} {index} / {self.num_staff}" ) - with ThreadPoolExecutor(max_workers=5) as executor: - tasks = {} - tasks[ - executor.submit(self.employees.fetch_employee, employee, self.domain) - ] = "employee" - tasks[executor.submit(self.skills.fetch_skills, employee)] = "skills" - tasks[executor.submit(self.experiences.fetch_experiences, employee)] = ( - "experiences" - ) - tasks[executor.submit(self.certs.fetch_certifications, employee)] = ( - "certifications" - ) - tasks[executor.submit(self.schools.fetch_schools, employee)] = "schools" - tasks[executor.submit(self.bio.fetch_employee_bio, employee)] = "bio" + with ThreadPoolExecutor(max_workers=6) as executor: + tasks = {executor.submit(self.employees.fetch_employee, employee, self.domain): "employee", + executor.submit(self.skills.fetch_skills, employee): "skills", + executor.submit(self.experiences.fetch_experiences, employee): ( + "experiences" + ), executor.submit(self.certs.fetch_certifications, employee): ( + "certifications" + ), executor.submit(self.schools.fetch_schools, employee): "schools", + executor.submit(self.bio.fetch_employee_bio, employee): "bio"} for future in as_completed(tasks): result = future.result() - if isinstance(result, 
TooManyRequests): - logger.debug(f"API rate limit exceeded for {tasks[future]}") - raise TooManyRequests( - f"Stopping due to API rate limit exceeded for {tasks[future]}" - ) - def fetch_company_id_from_user(self, user_id: str): - ep = self.get_company_from_user_ep.format(user_id=user_id) - res = self.session.get(ep) + def fetch_user_profile_data_from_public_id(self, user_id: str, key: str): + """Fetches data given the public LinkedIn user id""" + endpoint = self.public_user_id_ep.format(user_id=user_id) + response = self.session.get(endpoint) + try: - res_json = res.json() + response_json = response.json() except json.decoder.JSONDecodeError: - logger.debug(res.text[:200]) - raise Exception(f'Failed to load json in fetch_comany_id_from_user', res.status_code) + logger.debug(response.text[:200]) + raise Exception(f'Failed to load JSON from endpoint', response.status_code, response.reason) + + keys = { + 'user_id': ('positionView', 'profileId'), + 'company_id': ('positionView', 'elements', 0, 'company', 'miniCompany', 'universalName') + } + try: - return res_json['positionView']['elements'][0]['company']['miniCompany']['universalName'] - except: - raise Exception(f'Failed to fetch company for user_id {user_id}') + data = response_json + for k in keys[key]: + data = data[k] + return data + except (KeyError, TypeError, IndexError) as e: + logger.warning(f"Failed to find user_id {user_id}") + if key=='user_id': + return '' + raise Exception(f"Failed to fetch '{key}' for user_id {user_id}: {e}") diff --git a/staffspy/linkedin/schools.py b/staffspy/linkedin/schools.py index f8785aa..a7cc109 100644 --- a/staffspy/linkedin/schools.py +++ b/staffspy/linkedin/schools.py @@ -1,9 +1,9 @@ import json import logging -from staffspy.exceptions import TooManyRequests -from staffspy.models import School -from staffspy.utils import parse_dates +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import School +from staffspy.utils.utils import parse_dates 
logger = logging.getLogger(__name__) diff --git a/staffspy/linkedin/skills.py b/staffspy/linkedin/skills.py index 3be466a..38a530a 100644 --- a/staffspy/linkedin/skills.py +++ b/staffspy/linkedin/skills.py @@ -1,8 +1,8 @@ import json import logging -from staffspy.exceptions import TooManyRequests -from staffspy.models import Skill +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import Skill logger = logging.getLogger(__name__) diff --git a/staffspy/solvers/capsolver.py b/staffspy/solvers/capsolver.py index 4f07027..0d17af1 100644 --- a/staffspy/solvers/capsolver.py +++ b/staffspy/solvers/capsolver.py @@ -16,7 +16,7 @@ class CapSolver(Solver): @retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none)) def solve(self, blob_data: str, page_url: str=None): - from staffspy.utils import logger + from staffspy.utils.utils import logger logger.info(f'Waiting on CapSolver to solve captcha...') payload = { diff --git a/staffspy/solvers/solver.py b/staffspy/solvers/solver.py index 7d5cbfc..8df56db 100644 --- a/staffspy/solvers/solver.py +++ b/staffspy/solvers/solver.py @@ -1,5 +1,6 @@ from abc import ABC,abstractmethod + class Solver(ABC): public_key = "3117BF26-4762-4F5A-8ED9-A85E69209A46" page_url = "https://iframe.arkoselabs.com" diff --git a/staffspy/solvers/two_captcha.py b/staffspy/solvers/two_captcha.py index 4ef361c..f45b4a1 100644 --- a/staffspy/solvers/two_captcha.py +++ b/staffspy/solvers/two_captcha.py @@ -12,7 +12,7 @@ class TwoCaptchaSolver(Solver): @retry(stop=stop_after_attempt(5), retry=retry_if_exception_type((TimeoutException, ApiException))) def solve(self, blob_data: str, page_url:str=None): super().solve(blob_data, page_url) - from staffspy.utils import logger + from staffspy.utils.utils import logger logger.info(f'Waiting on 2Captcha to solve captcha attempt {self.attempt} / 5 ...') self.attempt+=1 diff --git a/staffspy/exceptions.py b/staffspy/utils/exceptions.py similarity index 100% rename from 
staffspy/exceptions.py rename to staffspy/utils/exceptions.py diff --git a/staffspy/models.py b/staffspy/utils/models.py similarity index 87% rename from staffspy/models.py rename to staffspy/utils/models.py index 21f6749..e9d006c 100644 --- a/staffspy/models.py +++ b/staffspy/utils/models.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from staffspy.utils import extract_emails_from_text +from staffspy.utils.utils import extract_emails_from_text class School(BaseModel): @@ -72,8 +72,9 @@ def to_dict(self): class Staff(BaseModel): search_term: str id: str - name: str - position: str | None = None + name: str | None = None + headline: str | None = None + current_position: str | None = None profile_id: str | None = None profile_link: str | None = None @@ -92,6 +93,7 @@ class Staff(BaseModel): influencer: bool | None = None creator: bool | None = None premium: bool | None = None + open_to_work: bool | None = None profile_photo: str | None = None skills: list[Skill] | None = None experiences: list[Experience] | None = None @@ -132,17 +134,19 @@ def to_dict(self): top_three_companies += [None] * (3 - len(top_three_companies)) top_three_skills=self.get_top_skills() + name = list(filter(None, [self.first_name, self.last_name])) self.emails_in_bio=extract_emails_from_text(self.bio) if self.bio else None + self.current_position = sorted_experiences[0].title if len(sorted_experiences) > 0 and sorted_experiences[0].end_date is None else None return { "search_term": self.search_term, "id": self.id, "profile_id": self.profile_id, - "name": self.name, + "name": self.name if self.name else ' '.join(name) if name else None, "first_name": self.first_name, "last_name": self.last_name, "location": self.location, - "position": self.position, + "headline": self.headline, "estimated_age": estimated_age, "followers": self.followers, "connections": self.connections, @@ -151,14 +155,16 @@ def to_dict(self): "premium": self.premium, "creator": self.creator, "influencer": self.influencer, -
"company_1": top_three_companies[0], - "company_2": top_three_companies[1], - "company_3": top_three_companies[2], + "open_to_work": self.open_to_work, + "current_position":self.current_position, + "current_company": top_three_companies[0], + "past_company_1": top_three_companies[1], + "past_company_2": top_three_companies[2], "school_1": top_three_school_names[0], "school_2": top_three_school_names[1], - "skill_1": top_three_skills[0], - "skill_2": top_three_skills[1], - "skill_3": top_three_skills[2], + "top_skill_1": top_three_skills[0], + "top_skill_2": top_three_skills[1], + "top_skill_3": top_three_skills[2], "bio": self.bio, "experiences": ( [exp.to_dict() for exp in self.experiences] diff --git a/staffspy/utils.py b/staffspy/utils/utils.py similarity index 99% rename from staffspy/utils.py rename to staffspy/utils/utils.py index 79370f7..22071b0 100644 --- a/staffspy/utils.py +++ b/staffspy/utils/utils.py @@ -2,7 +2,6 @@ import os import pickle import re -import time from datetime import datetime from urllib.parse import quote from dateutil.parser import parse @@ -12,7 +11,7 @@ from bs4 import BeautifulSoup from tenacity import stop_after_attempt, retry_if_exception_type, retry, RetryError -from staffspy.exceptions import BlobException +from staffspy.utils.exceptions import BlobException from staffspy.solvers.solver import Solver logger = logging.getLogger("StaffSpy") @@ -199,6 +198,7 @@ def save_session(self, session, session_file: str): pickle.dump(data, f) def load_session(self): + """Load session from session file, otherwise login""" session=None if not self.session_file or not os.path.exists(self.session_file): if self.username and self.password: