Bug fixes #35

Merged: 11 commits, Dec 15, 2023
6 changes: 0 additions & 6 deletions app/celery_broker/metadata_utils.py
@@ -87,9 +87,6 @@ def metadata_task(
task_name=metadata_type,
task=task,
)
crawls.update_status(
crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
)
continue
except Exception as e:
logger.error(
@@ -102,8 +99,5 @@ def metadata_task(
task_name=metadata_type,
task=task,
)
crawls.update_status(
crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
)
continue
return handle_metadata_result(task, crawl_process, result, metadata_type)
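
Note: the two deleted blocks above drop the crawl-level PARTIAL_ERROR writes from the per-metadata error handlers; each handler now only records its own failure and continues, and the overall crawl status is derived once from the sub-task statuses in finalize_crawl_process (see the tasks.py changes below).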
40 changes: 29 additions & 11 deletions app/celery_broker/tasks.py
@@ -1,8 +1,9 @@
# Standard library imports
import asyncio
import os
import pathlib
import shutil
from multiprocessing import Process, Manager
from multiprocessing import Manager, Process
from typing import Optional

# Local imports
@@ -49,14 +50,17 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
kwargs={"crawl_process": crawl_process, "results": shared_dict},
)
p.start()
p.join() # TODO define and add a timeout
p.join(120) # Wait 120 seconds for the crawler to finish
if p.is_alive():
logger.error("Crawler timed out, the crawl may not contain enough pages")
p.terminate()
p.join()

crawl_process.metadata.update(shared_dict["metadata"])

except Exception as e:
logger.error(f"Error while crawling html files: {e}")
set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.ERROR
)
self.update_state(state='FAILURE')
return crawl_process
try:
@@ -66,9 +70,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
logger.error(f"Error while uploading html files: {e}")
# Html crawl will be considered failed if we can't upload the html files
set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.ERROR
)
self.update_state(state='FAILURE')
return crawl_process
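
The open-ended p.join() in start_crawl_process above is replaced with a bounded wait plus an explicit terminate. A minimal, self-contained sketch of that pattern, assuming a stand-in worker function (run_crawler is purely illustrative; the 120-second value comes from the diff):

from multiprocessing import Process
import logging
import time

logger = logging.getLogger(__name__)

def run_crawler() -> None:
    # Stand-in for the real crawl entry point; pretend it hangs.
    time.sleep(300)

if __name__ == "__main__":
    p = Process(target=run_crawler)
    p.start()
    p.join(120)       # wait at most 120 seconds for the crawler to finish
    if p.is_alive():  # still running after the timeout
        logger.error("Crawler timed out, the crawl may not contain enough pages")
        p.terminate() # stop the child process
        p.join()      # reap it so no zombie process is left behind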

@@ -131,10 +132,27 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
# Retrieve the current status of the crawl
current_crawl = crawls.get(crawl_id=crawl.id)

have_success = False

# Retrieve the status of every sub-task (html_crawl, lighthouse, technologies, responsiveness, carbon_footprint).
# If any sub-task failed, the crawl is considered partially failed.
all_tasks = [current_crawl.html_crawl, current_crawl.lighthouse, current_crawl.technologies_and_trackers, current_crawl.responsiveness, current_crawl.carbon_footprint]
for task in all_tasks:
if task is None:
continue
if task.status != ProcessStatus.SUCCESS:
current_crawl.status = ProcessStatus.PARTIAL_ERROR
else:
have_success = True

if current_crawl.status == ProcessStatus.STARTED:
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.SUCCESS
)
current_crawl.status = ProcessStatus.SUCCESS
elif not have_success:
current_crawl.status = ProcessStatus.ERROR

crawls.update_status(
crawl_id=crawl.id, status=current_crawl.status
)

websites.store_last_crawl(
website_id=crawl.website_id,
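
The new finalize_crawl_process block downgrades the crawl to PARTIAL_ERROR as soon as any sub-task is not successful, promotes a still-STARTED crawl to SUCCESS, and falls back to ERROR when no sub-task succeeded at all. A condensed sketch of that decision, assuming a simplified stand-in for the project's ProcessStatus enum:

from enum import Enum
from typing import Optional

class ProcessStatus(str, Enum):  # simplified stand-in for the project's enum
    STARTED = "started"
    SUCCESS = "success"
    ERROR = "error"
    PARTIAL_ERROR = "partial_error"

def final_status(current: ProcessStatus,
                 sub_tasks: list[Optional[ProcessStatus]]) -> ProcessStatus:
    ran = [s for s in sub_tasks if s is not None]    # skip sub-tasks that never ran
    have_success = any(s == ProcessStatus.SUCCESS for s in ran)
    if any(s != ProcessStatus.SUCCESS for s in ran):
        current = ProcessStatus.PARTIAL_ERROR        # at least one sub-task failed
    if current == ProcessStatus.STARTED:
        return ProcessStatus.SUCCESS                 # every sub-task succeeded
    if not have_success:
        return ProcessStatus.ERROR                   # nothing succeeded at all
    return current

# e.g. final_status(ProcessStatus.STARTED,
#                   [ProcessStatus.SUCCESS, ProcessStatus.ERROR, None]) -> PARTIAL_ERROR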
18 changes: 17 additions & 1 deletion app/crawler/middlewares.py
@@ -3,6 +3,7 @@
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from pathlib import Path
from urllib.parse import urlparse

from app.config import settings

@@ -48,7 +49,7 @@ def from_crawler(cls, crawler):
def _format_file_path(self, response, spider) -> Path:
domain = spider.allowed_domains[0]
base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
file_name = response.url.split(f"{domain}")[-1].lstrip('/')
file_name = response.url.split(f"{domain}")[-1].strip('/')
if not file_name.endswith(".html"):
file_name = f"{file_name}.html"
if file_name == ".html":
@@ -63,12 +64,27 @@ def _save_html_locally(self, response, spider):
file_path.write_text(response.text)

def process_response(self, request, response, spider):

if self.page_limit != 0 and self.current_page_count >= self.page_limit:
raise IgnoreRequest(
f"Page limit reached. Ignoring request {request}"
)

if request.url.endswith("robots.txt"):
return response

if spider.first_real_url is None:
# Set allowed_url and allowed_domains only once, on the first request.
# This is useful when the first request is permanently redirected.
spider.first_real_url = response.url
parsed_url = urlparse(spider.first_real_url)
if parsed_url.path:
spider.allowed_url = parsed_url.path
else:
spider.allowed_url = parsed_url.netloc
spider.allowed_domains = [parsed_url.netloc]


if response.status == 200:
self.current_page_count += 1
self._save_html_locally(response, spider)
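
With first_real_url, the middleware re-derives the spider's crawl scope from the first real (non-robots.txt) response, so a permanent redirect on the start URL no longer causes every following page to be filtered out. A small sketch of that derivation, with an illustrative URL (derive_scope is a hypothetical helper name):

from urllib.parse import urlparse

def derive_scope(first_real_url: str) -> tuple[str, list[str]]:
    # Mirrors the middleware: restrict to the path when there is one,
    # otherwise allow the whole host; the domain list follows the response URL.
    parsed = urlparse(first_real_url)
    allowed_url = parsed.path if parsed.path else parsed.netloc
    return allowed_url, [parsed.netloc]

# Example: the start URL was permanently redirected to a sub-path
print(derive_scope("https://www.example.org/actualites/"))
# -> ('/actualites/', ['www.example.org'])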
20 changes: 7 additions & 13 deletions app/crawler/spider.py
@@ -1,5 +1,3 @@
from urllib.parse import urlparse

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
@@ -11,37 +9,33 @@ class MenesrSpider(CrawlSpider):
rules = (Rule(),)
use_playwright = False
allowed_url = None
first_real_url = None
page_count = 0
page_limit = 0
depth_limit = 0

def __init__(self, crawl_process: CrawlProcess, *a, **kw):
parsed_url = urlparse(crawl_process.config.url)
self.use_playwright = crawl_process.config.parameters.use_playwright
if parsed_url.path:
self.allowed_url = parsed_url.path
self.page_limit = crawl_process.config.parameters.limit
self.depth_limit = crawl_process.config.parameters.depth
self.allowed_domains = [parsed_url.netloc]
self.start_urls = [crawl_process.config.url]
self.crawl_process = crawl_process
super().__init__(*a, **kw)


def start_requests(self):
for url in self.start_urls:
meta = {
"depth": 0, # Set the initial depth to 0
}
if self.use_playwright:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
meta.update({
"playwright": True,
"playwright_page_methods": [
("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
]
})
else:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
],
})
yield Request(url, self.parse, meta=meta)

def parse(self, response, **kwargs):
# Crawl the links in the response page and continue to crawl the next page
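
start_requests now builds a single meta dict and only layers the Playwright keys on top when use_playwright is enabled, so the depth initialisation is written once and a single yield covers both cases. A reduced sketch of that merge (build_meta is a hypothetical helper; the Playwright keys are copied from the diff):

def build_meta(use_playwright: bool) -> dict:
    meta = {
        "depth": 0,  # set the initial depth to 0
    }
    if use_playwright:
        # Extra keys are only added when Playwright rendering is requested.
        meta.update({
            "playwright": True,
            "playwright_page_methods": [
                ("evaluate", "window.scrollTo(0, document.body.scrollHeight)"),
            ],
        })
    return meta

print(build_meta(False))  # {'depth': 0}
print(build_meta(True))   # adds the Playwright-specific keys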
2 changes: 1 addition & 1 deletion app/repositories/crawls.py
@@ -67,7 +67,7 @@ def update_status(self, crawl_id: str, status: ProcessStatus):
update_dict = {"status": status}
if status == ProcessStatus.STARTED:
update_dict["started_at"] = french_datetime()
if status == ProcessStatus.SUCCESS:
else:
update_dict["finished_at"] = french_datetime()
self.collection.update_one(
filter={"id": crawl_id},
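
With this repository change, finished_at is stamped for every non-STARTED status rather than only SUCCESS, so crawls ending in ERROR or PARTIAL_ERROR also record a completion time. A minimal sketch of the resulting update-dict construction (datetime.now stands in for the project's french_datetime helper):

from datetime import datetime

def build_status_update(status: str) -> dict:
    update = {"status": status}
    if status == "started":
        update["started_at"] = datetime.now()   # stand-in for french_datetime()
    else:
        # Any other status is terminal and gets a completion timestamp.
        update["finished_at"] = datetime.now()
    return update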
3 changes: 0 additions & 3 deletions tests/tests_crawler/test_menesr.py
@@ -20,16 +20,13 @@ def test_init_without_path(self):
self.mock_crawl_process.config.url = "http://example.com/"
spider = MenesrSpider(self.mock_crawl_process)

self.assertEqual(spider.allowed_domains, ["example.com"])
self.assertEqual(spider.start_urls, ["http://example.com/"])
self.assertTrue(isinstance(spider.rules, tuple))

def test_init_with_path(self):
spider = MenesrSpider(self.mock_crawl_process)

# Checking initialized values
parsed_url = urlparse(self.mock_crawl_process.config.url)
self.assertEqual(spider.allowed_domains, [parsed_url.netloc])
self.assertEqual(
spider.start_urls, [self.mock_crawl_process.config.url]
)