diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py
index 477873c..44a31a1 100644
--- a/app/celery_broker/metadata_utils.py
+++ b/app/celery_broker/metadata_utils.py
@@ -87,9 +87,6 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
         except Exception as e:
             logger.error(
@@ -102,8 +99,5 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
     return handle_metadata_result(task, crawl_process, result, metadata_type)
diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index ec55c71..d40b034 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -1,8 +1,9 @@
 # Standard library imports
+import asyncio
 import os
 import pathlib
 import shutil
-from multiprocessing import Process, Manager
+from multiprocessing import Manager, Process
 from typing import Optional
 
 # Local imports
@@ -49,14 +50,17 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
             kwargs={"crawl_process": crawl_process, "results": shared_dict},
         )
         p.start()
-        p.join()  # TODO define and add a timeout
+        p.join(120)  # Wait 120 seconds for the crawler to finish
+        if p.is_alive():
+            logger.error("Crawler timed out, the crawl may not contain enough pages")
+            p.terminate()
+            p.join()
+
         crawl_process.metadata.update(shared_dict["metadata"])
+
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
     try:
@@ -66,9 +70,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
         logger.error(f"Error while uploading html files: {e}")
         # Html crawl will be considered failed if we can't upload the html files
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
 
@@ -131,10 +132,27 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
 
     # Retrieve the current status of the crawl
     current_crawl = crawls.get(crawl_id=crawl.id)
+    have_success = False
+
+    # Retrieve the status of all the sub tasks (html_crawl, lighthouse, technologies, responsiveness, carbon_footprint)
+    # If one of the sub tasks failed, we consider the crawl as partially failed
+    all_tasks = [current_crawl.html_crawl, current_crawl.lighthouse, current_crawl.technologies_and_trackers, current_crawl.responsiveness, current_crawl.carbon_footprint]
+    for task in all_tasks:
+        if task is None:
+            continue
+        if task.status != ProcessStatus.SUCCESS:
+            current_crawl.status = ProcessStatus.PARTIAL_ERROR
+        else:
+            have_success = True
+
     if current_crawl.status == ProcessStatus.STARTED:
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.SUCCESS
-        )
+        current_crawl.status = ProcessStatus.SUCCESS
+    elif not have_success:
+        current_crawl.status = ProcessStatus.ERROR
+
+    crawls.update_status(
+        crawl_id=crawl.id, status=current_crawl.status
+    )
 
     websites.store_last_crawl(
         website_id=crawl.website_id,
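Note on the p.join(120) change in start_crawl_process above: the standalone sketch below reproduces the join-with-timeout pattern in isolation. Only the join/is_alive/terminate/join sequence and the 120-second limit come from the diff; the crawl() worker, its payload and the script layout are illustrative stand-ins.

# Standalone sketch of the timeout handling added to start_crawl_process.
import time
from multiprocessing import Manager, Process


def crawl(results):
    # Stand-in for the Scrapy sub-process; it just sleeps and reports metadata.
    time.sleep(1)
    results["metadata"] = {"pages": 1}


if __name__ == "__main__":
    with Manager() as manager:
        shared_dict = manager.dict()
        p = Process(target=crawl, kwargs={"results": shared_dict})
        p.start()
        p.join(120)  # wait up to 120 seconds for the child to finish
        if p.is_alive():
            # Timed out: kill the child and reap it so no zombie is left behind.
            p.terminate()
            p.join()
        print(dict(shared_dict).get("metadata", {}))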
diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py
index 7241e9b..1ed6ef7 100644
--- a/app/crawler/middlewares.py
+++ b/app/crawler/middlewares.py
@@ -3,6 +3,7 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from pathlib import Path
+from urllib.parse import urlparse
 
 from app.config import settings
 
@@ -48,7 +49,7 @@ def from_crawler(cls, crawler):
     def _format_file_path(self, response, spider) -> Path:
         domain = spider.allowed_domains[0]
         base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
-        file_name = response.url.split(f"{domain}")[-1].lstrip('/')
+        file_name = response.url.split(f"{domain}")[-1].strip('/')
         if not file_name.endswith(".html"):
             file_name = f"{file_name}.html"
         if file_name == ".html":
@@ -63,12 +64,27 @@ def _save_html_locally(self, response, spider):
         file_path.write_text(response.text)
 
     def process_response(self, request, response, spider):
+
         if self.page_limit != 0 and self.current_page_count >= self.page_limit:
             raise IgnoreRequest(
                 f"Page limit reached. Ignoring request {request}"
             )
+
         if request.url.endswith("robots.txt"):
             return response
+
+        if spider.first_real_url is None:
+            # We should set the allowed_url and allowed_domains only once for the first request.
+            # It is useful when we have a permanent redirection in the first request.
+            spider.first_real_url = response.url
+            parsed_url = urlparse(spider.first_real_url)
+            if parsed_url.path:
+                spider.allowed_url = parsed_url.path
+            else:
+                spider.allowed_url = parsed_url.netloc
+            spider.allowed_domains = [parsed_url.netloc]
+
+
         if response.status == 200:
             self.current_page_count += 1
             self._save_html_locally(response, spider)
diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index e56ee28..e56559e 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -1,5 +1,3 @@
-from urllib.parse import urlparse
-
 from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
@@ -11,18 +9,15 @@ class MenesrSpider(CrawlSpider):
     rules = (Rule(),)
     use_playwright = False
     allowed_url = None
+    first_real_url = None
    page_count = 0
     page_limit = 0
     depth_limit = 0
 
     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
-        parsed_url = urlparse(crawl_process.config.url)
         self.use_playwright = crawl_process.config.parameters.use_playwright
-        if parsed_url.path:
-            self.allowed_url = parsed_url.path
         self.page_limit = crawl_process.config.parameters.limit
         self.depth_limit = crawl_process.config.parameters.depth
-        self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
         super().__init__(*a, **kw)
@@ -30,18 +25,17 @@ def __init__(self, crawl_process: CrawlProcess, *a, **kw):
 
     def start_requests(self):
         for url in self.start_urls:
+            meta = {
+                "depth": 0,  # Set the initial depth to 0
+            }
             if self.use_playwright:
-                yield Request(url, self.parse, meta={
-                    "depth": 0,  # Set the initial depth to 0
+                meta.update({
                     "playwright": True,
                     "playwright_page_methods": [
                         ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
-                    ]
-                })
-            else:
-                yield Request(url, self.parse, meta={
-                    "depth": 0,  # Set the initial depth to 0
+                    ],
                 })
+            yield Request(url, self.parse, meta=meta)
 
     def parse(self, response, **kwargs):
         # Crawl the links in the response page and continue to crawl the next page
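Note on moving the allowed_domains/allowed_url setup from MenesrSpider.__init__ into the middleware above: the crawl scope is now derived from the first real response URL, so a permanent redirect on the start URL is taken into account. A minimal sketch of that urlparse logic; derive_scope() and the example URL are illustrative, not from the repository:

# Sketch of the scope derivation now done in the middleware on the first real
# (post-redirect) response.
from urllib.parse import urlparse


def derive_scope(first_real_url: str) -> tuple[list[str], str]:
    parsed_url = urlparse(first_real_url)
    # allowed_url keeps the path when there is one, otherwise the host itself.
    allowed_url = parsed_url.path if parsed_url.path else parsed_url.netloc
    allowed_domains = [parsed_url.netloc]
    return allowed_domains, allowed_url


print(derive_scope("https://www.example.gouv.fr/actualites"))
# -> (['www.example.gouv.fr'], '/actualites')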
diff --git a/app/repositories/crawls.py b/app/repositories/crawls.py
index b16d7dd..5f795e1 100644
--- a/app/repositories/crawls.py
+++ b/app/repositories/crawls.py
@@ -67,7 +67,7 @@ def update_status(self, crawl_id: str, status: ProcessStatus):
         update_dict = {"status": status}
         if status == ProcessStatus.STARTED:
             update_dict["started_at"] = french_datetime()
-        if status == ProcessStatus.SUCCESS:
+        else:
             update_dict["finished_at"] = french_datetime()
         self.collection.update_one(
             filter={"id": crawl_id},
diff --git a/tests/tests_crawler/test_menesr.py b/tests/tests_crawler/test_menesr.py
index b301d6c..5243f6e 100644
--- a/tests/tests_crawler/test_menesr.py
+++ b/tests/tests_crawler/test_menesr.py
@@ -20,7 +20,6 @@ def test_init_without_path(self):
         self.mock_crawl_process.config.url = "http://example.com/"
 
         spider = MenesrSpider(self.mock_crawl_process)
-        self.assertEqual(spider.allowed_domains, ["example.com"])
         self.assertEqual(spider.start_urls, ["http://example.com/"])
         self.assertTrue(isinstance(spider.rules, tuple))
 
@@ -28,8 +27,6 @@ def test_init_with_path(self):
         spider = MenesrSpider(self.mock_crawl_process)
 
         # Checking initialized values
-        parsed_url = urlparse(self.mock_crawl_process.config.url)
-        self.assertEqual(spider.allowed_domains, [parsed_url.netloc])
         self.assertEqual(
             spider.start_urls, [self.mock_crawl_process.config.url]
         )
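Note on the status roll-up added to finalize_crawl_process (in app/celery_broker/tasks.py above): any non-successful sub-task downgrades the crawl to PARTIAL_ERROR, a crawl where no sub-task succeeded at all ends as ERROR, and a status still at STARTED becomes SUCCESS. A self-contained sketch of that rule; the ProcessStatus enum and aggregate_status() below are illustrative stand-ins for the repository's models:

# Self-contained sketch of the roll-up rule applied to sub-task statuses.
from enum import Enum
from typing import Optional


class ProcessStatus(str, Enum):
    STARTED = "started"
    SUCCESS = "success"
    PARTIAL_ERROR = "partial_error"
    ERROR = "error"


def aggregate_status(
    crawl_status: ProcessStatus, sub_tasks: list[Optional[ProcessStatus]]
) -> ProcessStatus:
    have_success = False
    status = crawl_status
    for task_status in sub_tasks:
        if task_status is None:
            continue  # sub-task was never scheduled
        if task_status != ProcessStatus.SUCCESS:
            status = ProcessStatus.PARTIAL_ERROR
        else:
            have_success = True
    if status == ProcessStatus.STARTED:
        status = ProcessStatus.SUCCESS  # every scheduled sub-task succeeded
    elif not have_success:
        status = ProcessStatus.ERROR  # nothing succeeded at all
    return status


assert aggregate_status(ProcessStatus.STARTED, [ProcessStatus.SUCCESS, None]) == ProcessStatus.SUCCESS
assert aggregate_status(ProcessStatus.STARTED, [ProcessStatus.SUCCESS, ProcessStatus.ERROR]) == ProcessStatus.PARTIAL_ERROR
assert aggregate_status(ProcessStatus.STARTED, [ProcessStatus.ERROR, ProcessStatus.ERROR]) == ProcessStatus.ERROR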