From 770072e721f28180b2f96861d255e2ea31bedf64 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 11 Dec 2023 15:44:24 +0100
Subject: [PATCH 01/11] fix: strip "/" from filename to avoid having '/.html' as filename

---
 app/crawler/middlewares.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py
index 7241e9b..18b133d 100644
--- a/app/crawler/middlewares.py
+++ b/app/crawler/middlewares.py
@@ -48,7 +48,7 @@ def from_crawler(cls, crawler):
     def _format_file_path(self, response, spider) -> Path:
         domain = spider.allowed_domains[0]
         base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
-        file_name = response.url.split(f"{domain}")[-1].lstrip('/')
+        file_name = response.url.split(f"{domain}")[-1].strip('/')
         if not file_name.endswith(".html"):
             file_name = f"{file_name}.html"
         if file_name == ".html":

From 2188a83e8abe4a306247d8c80fe29414b6c02aff Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 11 Dec 2023 16:02:50 +0100
Subject: [PATCH 02/11] fix: stop creating sub-process for html crawl

Avoid the error "daemonic processes are not allowed to have children" when we use prefork.
---
 app/celery_broker/crawler_utils.py |  3 +--
 app/celery_broker/tasks.py         | 10 +---------
 docker-compose.yml                 |  2 ++
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/app/celery_broker/crawler_utils.py b/app/celery_broker/crawler_utils.py
index 6b31e6d..9f3059f 100644
--- a/app/celery_broker/crawler_utils.py
+++ b/app/celery_broker/crawler_utils.py
@@ -33,11 +33,10 @@ def init_crawler_settings(crawl_process: CrawlProcess):
     return settings
 
 
-def start_crawler_process(crawl_process: CrawlProcess, results: dict):
+def start_crawler_process(crawl_process: CrawlProcess):
     process = CrawlerProcess(settings=init_crawler_settings(crawl_process))
     process.crawl(MenesrSpider, crawl_process=crawl_process)
     process.start()
-    results["metadata"] = dict(crawl_process.metadata.items())
 
 
 def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessStatus):
diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index ec55c71..a9cac78 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -42,15 +42,7 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     crawl_process = CrawlProcess.from_model(crawl)
 
     try:
-        with Manager() as manager:
-            shared_dict = manager.dict()
-            p = Process(
-                target=start_crawler_process,
-                kwargs={"crawl_process": crawl_process, "results": shared_dict},
-            )
-            p.start()
-            p.join()  # TODO define and add a timeout
-            crawl_process.metadata.update(shared_dict["metadata"])
+        start_crawler_process(crawl_process=crawl_process)
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
diff --git a/docker-compose.yml b/docker-compose.yml
index 1d5cbc5..88aa510 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,6 +32,8 @@ services:
     build: .
     env_file: .env
     command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n crawl_worker -Q crawl_queue,finalize_crawl_queue'
+    # An example with prefork.
+    #command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P prefork -c 4 -n crawl_worker -Q crawl_queue,finalize_crawl_queue'
     volumes:
       - ./app:/open-crawler/app
       - local_files:${LOCAL_FILES_PATH}

From e26895c6fc23f401e77ea0af0ede581e3d775637 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Tue, 12 Dec 2023 15:15:03 +0100
Subject: [PATCH 03/11] Fix: MSPDT-76 - update finished_at

---
 app/celery_broker/tasks.py | 9 +++++----
 app/repositories/crawls.py | 7 +++++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index a9cac78..c05ea53 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -2,7 +2,6 @@
 import os
 import pathlib
 import shutil
-from multiprocessing import Process, Manager
 from typing import Optional
 
 # Local imports
@@ -124,9 +123,11 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
     current_crawl = crawls.get(crawl_id=crawl.id)
 
     if current_crawl.status == ProcessStatus.STARTED:
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.SUCCESS
-        )
+        current_crawl.status = ProcessStatus.SUCCESS
+
+    crawls.update_status(
+        crawl_id=crawl.id, status=current_crawl.status, final_status=True
+    )
 
     websites.store_last_crawl(
         website_id=crawl.website_id,
diff --git a/app/repositories/crawls.py b/app/repositories/crawls.py
index b16d7dd..a9cf48d 100644
--- a/app/repositories/crawls.py
+++ b/app/repositories/crawls.py
@@ -63,11 +63,14 @@ def update(self, data: CrawlModel):
             },
         )
 
-    def update_status(self, crawl_id: str, status: ProcessStatus):
+    def update_status(self, crawl_id: str, status: ProcessStatus, final_status: bool = False):
         update_dict = {"status": status}
         if status == ProcessStatus.STARTED:
             update_dict["started_at"] = french_datetime()
-        if status == ProcessStatus.SUCCESS:
+        elif status == ProcessStatus.SUCCESS:
+            update_dict["finished_at"] = french_datetime()
+        # In the finalize task, we should update the finished_at field regardless of the status
+        if final_status:
             update_dict["finished_at"] = french_datetime()
         self.collection.update_one(
             filter={"id": crawl_id},

From 9ed4077d79fd718eac5796c8354209f723da05f1 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Wed, 13 Dec 2023 17:27:43 +0100
Subject: [PATCH 04/11] MSPDT-80: update crawl process status if a metadata task fails

---
 app/celery_broker/metadata_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py
index 477873c..ef54133 100644
--- a/app/celery_broker/metadata_utils.py
+++ b/app/celery_broker/metadata_utils.py
@@ -25,6 +25,11 @@ def handle_metadata_result(
             task_name=metadata_type,
             task=task,
         )
+        # When the metadata task fails, the crawl process is considered partially failed,
+        # because metadata tasks are executed only if the html_crawl is successful.
+        crawls.update_status(
+            crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
+        )
         logger.error(f"{metadata_type} failed.")
         return crawl_process
     store_metadata_result(crawl_process, result, metadata_type)

From 6d0fd2003324c757619b6b9c676ce109a71cb8fa Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 10:22:19 +0100
Subject: [PATCH 05/11] Revert "MSPDT-80: update crawl process status if a metadata task fails"

This reverts commit 9ed4077d79fd718eac5796c8354209f723da05f1.
---
 app/celery_broker/metadata_utils.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py
index ef54133..477873c 100644
--- a/app/celery_broker/metadata_utils.py
+++ b/app/celery_broker/metadata_utils.py
@@ -25,11 +25,6 @@ def handle_metadata_result(
             task_name=metadata_type,
             task=task,
         )
-        # When the metadata task fails, the crawl process is considered partially failed,
-        # because metadata tasks are executed only if the html_crawl is successful.
-        crawls.update_status(
-            crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-        )
         logger.error(f"{metadata_type} failed.")
         return crawl_process
     store_metadata_result(crawl_process, result, metadata_type)

From f448e867c1a9e9d42b28b48912c37f1a66eacf12 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 14:12:45 +0100
Subject: [PATCH 06/11] Allow the first url of the response (handle permanent redirections)

---
 app/crawler/middlewares.py | 16 ++++++++++++++++
 app/crawler/spider.py      | 20 +++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py
index 18b133d..1ed6ef7 100644
--- a/app/crawler/middlewares.py
+++ b/app/crawler/middlewares.py
@@ -3,6 +3,7 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from pathlib import Path
+from urllib.parse import urlparse
 
 from app.config import settings
 
@@ -63,12 +64,27 @@ def _save_html_locally(self, response, spider):
         file_path.write_text(response.text)
 
     def process_response(self, request, response, spider):
+
         if self.page_limit != 0 and self.current_page_count >= self.page_limit:
             raise IgnoreRequest(
                 f"Page limit reached. Ignoring request {request}"
             )
+
         if request.url.endswith("robots.txt"):
             return response
+
+        if spider.first_real_url is None:
+            # We should set the allowed_url and allowed_domains only once, for the first request.
+            # It is useful when we have a permanent redirection in the first request.
+            spider.first_real_url = response.url
+            parsed_url = urlparse(spider.first_real_url)
+            if parsed_url.path:
+                spider.allowed_url = parsed_url.path
+            else:
+                spider.allowed_url = parsed_url.netloc
+            spider.allowed_domains = [parsed_url.netloc]
+
+
         if response.status == 200:
             self.current_page_count += 1
             self._save_html_locally(response, spider)
diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index e56ee28..e56559e 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -1,5 +1,3 @@
-from urllib.parse import urlparse
-
 from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
@@ -11,18 +9,15 @@ class MenesrSpider(CrawlSpider):
     rules = (Rule(),)
     use_playwright = False
     allowed_url = None
+    first_real_url = None
     page_count = 0
     page_limit = 0
     depth_limit = 0
 
     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
-        parsed_url = urlparse(crawl_process.config.url)
         self.use_playwright = crawl_process.config.parameters.use_playwright
-        if parsed_url.path:
-            self.allowed_url = parsed_url.path
         self.page_limit = crawl_process.config.parameters.limit
         self.depth_limit = crawl_process.config.parameters.depth
-        self.allowed_domains = [parsed_url.netloc]
        self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
         super().__init__(*a, **kw)
@@ -30,18 +25,17 @@
 
     def start_requests(self):
         for url in self.start_urls:
+            meta = {
+                "depth": 0,  # Set the initial depth to 0
+            }
             if self.use_playwright:
-                yield Request(url, self.parse, meta={
-                    "depth": 0,  # Set the initial depth to 0
+                meta.update({
                     "playwright": True,
                     "playwright_page_methods": [
                         ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
-                    ]
-                })
-            else:
-                yield Request(url, self.parse, meta={
-                    "depth": 0,  # Set the initial depth to 0
+                    ],
                 })
+            yield Request(url, self.parse, meta=meta)
 
     def parse(self, response, **kwargs):
         # Crawl the links in the response page and continue to crawl the next page

From 669b990518a8fd7ab320b07693161e3a654dfbfd Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 14:17:54 +0100
Subject: [PATCH 07/11] fix: correct tests (allowed domains are no longer set at initialisation)

---
 tests/tests_crawler/test_menesr.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/tests_crawler/test_menesr.py b/tests/tests_crawler/test_menesr.py
index b301d6c..5243f6e 100644
--- a/tests/tests_crawler/test_menesr.py
+++ b/tests/tests_crawler/test_menesr.py
@@ -20,7 +20,6 @@ def test_init_without_path(self):
         self.mock_crawl_process.config.url = "http://example.com/"
         spider = MenesrSpider(self.mock_crawl_process)
 
-        self.assertEqual(spider.allowed_domains, ["example.com"])
         self.assertEqual(spider.start_urls, ["http://example.com/"])
         self.assertTrue(isinstance(spider.rules, tuple))
 
     def test_init_with_path(self):
         spider = MenesrSpider(self.mock_crawl_process)
 
         # Checking initialized values
-        parsed_url = urlparse(self.mock_crawl_process.config.url)
-        self.assertEqual(spider.allowed_domains, [parsed_url.netloc])
         self.assertEqual(
             spider.start_urls, [self.mock_crawl_process.config.url]
         )

From b58e64b0bd0cf3435c4f0979de6beb7e69c745c9 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 15:00:57 +0100
Subject: [PATCH 08/11] MSPDT-80: Check all the task statuses in the finalize_crawl task

---
 app/celery_broker/tasks.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index c05ea53..06451fe 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -122,8 +122,23 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
     # Retrieve the current status of the crawl
     current_crawl = crawls.get(crawl_id=crawl.id)
 
+    have_success = False
+
+    # Retrieve the status of all the sub tasks (html_crawl, lighthouse, technologies, responsiveness, carbon_footprint)
+    # If one of the sub tasks failed, we consider the crawl as partially failed
+    all_tasks = [current_crawl.html_crawl, current_crawl.lighthouse, current_crawl.technologies_and_trackers, current_crawl.responsiveness, current_crawl.carbon_footprint]
+    for task in all_tasks:
+        if task is None:
+            continue
+        if task.status != ProcessStatus.SUCCESS:
+            current_crawl.status = ProcessStatus.PARTIAL_ERROR
+        else:
+            have_success = True
+
     if current_crawl.status == ProcessStatus.STARTED:
         current_crawl.status = ProcessStatus.SUCCESS
+    elif not have_success:
+        current_crawl.status = ProcessStatus.ERROR
 
     crawls.update_status(
         crawl_id=crawl.id, status=current_crawl.status, final_status=True

From 055fff6462040d1abe8f9a021709e1d1487c2160 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 15:10:06 +0100
Subject: [PATCH 09/11] fix: only calculate crawl status at the beginning and the end of the crawl

---
 app/celery_broker/metadata_utils.py | 6 ------
 app/celery_broker/tasks.py          | 8 +-------
 app/repositories/crawls.py          | 7 ++-----
 3 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py
index 477873c..44a31a1 100644
--- a/app/celery_broker/metadata_utils.py
+++ b/app/celery_broker/metadata_utils.py
@@ -87,9 +87,6 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
         except Exception as e:
             logger.error(
@@ -102,8 +99,5 @@
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
     return handle_metadata_result(task, crawl_process, result, metadata_type)
diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index 06451fe..3f3dbb0 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -45,9 +45,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
     try:
@@ -57,9 +54,6 @@
         logger.error(f"Error while uploading html files: {e}")
         # Html crawl will be considered failed if we can't upload the html files
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
 
@@ -141,7 +135,7 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
         current_crawl.status = ProcessStatus.ERROR
 
     crawls.update_status(
-        crawl_id=crawl.id, status=current_crawl.status, final_status=True
+        crawl_id=crawl.id, status=current_crawl.status
     )
 
     websites.store_last_crawl(
diff --git a/app/repositories/crawls.py b/app/repositories/crawls.py
index a9cf48d..5f795e1 100644
--- a/app/repositories/crawls.py
+++ b/app/repositories/crawls.py
@@ -63,14 +63,11 @@ def update(self, data: CrawlModel):
             },
         )
 
-    def update_status(self, crawl_id: str, status: ProcessStatus, final_status: bool = False):
+    def update_status(self, crawl_id: str, status: ProcessStatus):
         update_dict = {"status": status}
         if status == ProcessStatus.STARTED:
             update_dict["started_at"] = french_datetime()
-        elif status == ProcessStatus.SUCCESS:
-            update_dict["finished_at"] = french_datetime()
-        # In the finalize task, we should update the finished_at field regardless of the status
-        if final_status:
+        else:
             update_dict["finished_at"] = french_datetime()
         self.collection.update_one(
             filter={"id": crawl_id},

From 56a59f766d8ef1d879c5df8721e430971f975bd3 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 17:10:02 +0100
Subject: [PATCH 10/11] Revert "fix: stop creating sub-process for html crawl"

This reverts commit 2188a83e8abe4a306247d8c80fe29414b6c02aff.
---
 app/celery_broker/crawler_utils.py |  3 ++-
 app/celery_broker/tasks.py         | 10 +++++++++-
 docker-compose.yml                 |  2 --
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/app/celery_broker/crawler_utils.py b/app/celery_broker/crawler_utils.py
index 9f3059f..6b31e6d 100644
--- a/app/celery_broker/crawler_utils.py
+++ b/app/celery_broker/crawler_utils.py
@@ -33,10 +33,11 @@ def init_crawler_settings(crawl_process: CrawlProcess):
     return settings
 
 
-def start_crawler_process(crawl_process: CrawlProcess):
+def start_crawler_process(crawl_process: CrawlProcess, results: dict):
     process = CrawlerProcess(settings=init_crawler_settings(crawl_process))
     process.crawl(MenesrSpider, crawl_process=crawl_process)
     process.start()
+    results["metadata"] = dict(crawl_process.metadata.items())
 
 
 def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessStatus):
diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index 3f3dbb0..498d91c 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -41,7 +41,15 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     crawl_process = CrawlProcess.from_model(crawl)
 
     try:
-        start_crawler_process(crawl_process=crawl_process)
+        with Manager() as manager:
+            shared_dict = manager.dict()
+            p = Process(
+                target=start_crawler_process,
+                kwargs={"crawl_process": crawl_process, "results": shared_dict},
+            )
+            p.start()
+            p.join()  # TODO define and add a timeout
+            crawl_process.metadata.update(shared_dict["metadata"])
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
diff --git a/docker-compose.yml b/docker-compose.yml
index 88aa510..1d5cbc5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,8 +32,6 @@ services:
     build: .
     env_file: .env
     command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n crawl_worker -Q crawl_queue,finalize_crawl_queue'
-    # An example with prefork.
-    #command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P prefork -c 4 -n crawl_worker -Q crawl_queue,finalize_crawl_queue'
     volumes:
       - ./app:/open-crawler/app
       - local_files:${LOCAL_FILES_PATH}

From bc6ed5e2ffb26722000cdd1df52ed9cb825cddff Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Thu, 14 Dec 2023 17:32:50 +0100
Subject: [PATCH 11/11] MSPDT-75: Define timeout for html_crawl process

---
 app/celery_broker/tasks.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index 498d91c..d40b034 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -1,7 +1,9 @@
 # Standard library imports
+import asyncio
 import os
 import pathlib
 import shutil
+from multiprocessing import Manager, Process
 from typing import Optional
 
 # Local imports
@@ -48,8 +50,14 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
                 kwargs={"crawl_process": crawl_process, "results": shared_dict},
             )
             p.start()
-            p.join()  # TODO define and add a timeout
+            p.join(120)  # Wait 120 seconds for the crawler to finish
+            if p.is_alive():
+                logger.error("Crawler timed out, the crawl may not contain enough pages")
+                p.terminate()
+                p.join()
+
             crawl_process.metadata.update(shared_dict["metadata"])
+
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)