diff --git a/.env b/.env index 4bd1584..60dbdc5 100644 --- a/.env +++ b/.env @@ -1,14 +1,15 @@ MODE=dev -#Volumes Config +CONFIG_PROFILE=development + +# Volumes Config LOCAL_FILES_PATH=/mounted/local_files/ # TODO: Change value before build MINIO_PATH=/mounted/minio/ # TODO: Change value before build MONGODB_PATH=/mounted/mongodb/ # TODO: Change value before build -#Storage Service Config - -#API_KEYS -GOOGLE_API_KEY=CHANGEME +# API_KEYS +GOOGLE_API_KEY=AIzaSyAfdZFZM1mz7IYUgCpESSJX4zdJZ589eX0 +# Scrapy SCRAPY_SETTINGS_MODULE=app.crawler.settings C_FORCE_ROOT=True @@ -24,10 +25,9 @@ STORAGE_SERVICE_USERNAME=admin STORAGE_SERVICE_PASSWORD=password123 STORAGE_SERVICE_URL=minio:9000 STORAGE_SERVICE_REGION=gra +STORAGE_SERVICE_SECURE=false STORAGE_SERVICE_BUCKET_NAME=open-crawler HTML_FOLDER_NAME=html # TODO: Change value before build METADATA_FOLDER_NAME=metadata # TODO: Change value before build -LOGGER_LEVEL=INFO - DEFAULT_RECRAWL_INTERVAL=30 \ No newline at end of file diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index a585c93..65e787f 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -20,6 +20,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name : Install Packages + run : pip install -r requirements.txt + - name: test run: python -m unittest diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..c977bc4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,22 @@ +name: Testing deployment + +on: + push: + +jobs: + unit-test: + name: run unit tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v1 + with: + python-version: 3.11 + + - name : Install Packages + run : pip install -r requirements.txt + + - name: test + run: python -m unittest \ No newline at end of file diff --git a/README.md b/README.md index 961db5e..1e80c06 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,7 @@ This endpoint allows you to create a new website configuration end execute a cra | `depth` | `integer` | Maximum depth to crawl (**Default**: 2) | | `limit` | `integer` | Maximum pages to crawl (**Default**: 400) | | `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {})| -| `accessibility` | `MetadataConfig` | Accessibility configuration (**Default**: {'enabled':True, 'depth' 0}) | -| `good_practices` | `MetadataConfig` | Good Practices configuration (**Default**: {'enabled': False}) | +| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth' 0}) | | `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) | | `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) | | `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) | @@ -132,7 +131,9 @@ To access the two collections, use a MongoDB console (such as MongoDB Compass fo **website_crawl_parameters** collection: ![mongodb_config](./demo/mongodb_crawl_configuration.png)## Acces simple storage service -At the end of the crawl process, all crawled html pages and metadata files are uploaded to a simple storage service (s3). +At the end of the crawl process, all crawled html pages are uploaded to a simple storage service (s3). +The metadata are directly uploaded to the storage service. 
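Because every HTML page and metadata file lands in the bucket under a prefix equal to the crawl id, a crawl's artifacts can also be inspected directly with the MinIO Python client. The snippet below is a minimal sketch rather than repository code: it assumes the MinIO API port is published on `localhost:9000` and reuses the development credentials and bucket name from the sample `.env` (`admin` / `password123`, bucket `open-crawler`); the crawl id is a placeholder.

```python
# Minimal sketch (not part of the repo): list a crawl's objects in the MinIO bucket.
# Assumes the MinIO API is reachable on localhost:9000 and the sample .env values are in use.
from minio import Minio

client = Minio(
    "localhost:9000",          # STORAGE_SERVICE_URL as seen from the host (assumption)
    access_key="admin",        # STORAGE_SERVICE_USERNAME
    secret_key="password123",  # STORAGE_SERVICE_PASSWORD
    secure=False,              # STORAGE_SERVICE_SECURE=false
)

crawl_id = "<your-crawl-id>"   # hypothetical placeholder
for obj in client.list_objects("open-crawler", prefix=crawl_id, recursive=True):
    # e.g. "<crawl_id>/html/index.html" or "<crawl_id>/metadata/lighthouse.json"
    print(obj.object_name)
```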
+ The docker-compose file deploys a MinIO service that can be accessed at http://localhost:9090. (by default) ![minio](./demo/minio.png) \ No newline at end of file diff --git a/app/api/crawls_router.py b/app/api/crawls_router.py index fb9780a..3724ebd 100644 --- a/app/api/crawls_router.py +++ b/app/api/crawls_router.py @@ -1,10 +1,5 @@ -import io -import os -from zipfile import ZipFile, ZIP_DEFLATED - from fastapi import HTTPException, APIRouter, status as statuscode from fastapi.responses import StreamingResponse -from minio import Minio import app.repositories as repositories from app.api.utils import create_crawl, start_crawl @@ -58,34 +53,23 @@ def list_crawls( status_code=statuscode.HTTP_200_OK, summary="Get a zip of all files from a crawl", ) -def get_crawl_files(website_id: str, crawl_id: str) -> StreamingResponse: +def get_crawl_files(crawl_id: str) -> StreamingResponse: """Zip the files from the storage service""" - client = Minio( - endpoint=os.environ["STORAGE_SERVICE_URL"], - access_key=os.environ["STORAGE_SERVICE_USERNAME"], - secret_key=os.environ["STORAGE_SERVICE_PASSWORD"], - secure=os.environ.get("STORAGE_SERVICE_SECURE", False), - region=os.environ.get("STORAGE_SERVICE_REGION", None), - ) - - bucket = os.environ["STORAGE_SERVICE_BUCKET_NAME"] - zip_io = io.BytesIO() - if not (crawl := repositories.crawls.get(website_id, crawl_id)): - raise HTTPException( - status_code=statuscode.HTTP_404_NOT_FOUND, - detail="Crawl not found", - ) - url = crawl.config.url.replace("https://", "").replace("http://", "") - prefix = f"{url}/{crawl_id}" - objects = client.list_objects(bucket, prefix=prefix, recursive=True) - with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper: - for obj in objects: - file = client.get_object(bucket, obj.object_name).read() - zipper.writestr(obj.object_name, file) + zip_io = repositories.files.zip_all_crawl_files(crawl_id) return StreamingResponse( iter([zip_io.getvalue()]), media_type="application/x-zip-compressed", headers={ - "Content-Disposition": f"attachment; filename={url}-{crawl_id}.zip" + "Content-Disposition": f"attachment; filename={crawl_id}.zip" }, ) + + +@crawls_router.delete( + "/{website_id}/crawls/{crawl_id}", + status_code=statuscode.HTTP_204_NO_CONTENT, + summary="Delete a crawl", +) +def delete_crawl(crawl_id: str) -> None: + """Zip the files from the storage service""" + return repositories.files.delete_all_crawl_files(crawl_id) diff --git a/app/api/factory.py b/app/api/factory.py index 0a33943..79e8d80 100644 --- a/app/api/factory.py +++ b/app/api/factory.py @@ -5,6 +5,7 @@ from app.api.crawls_router import crawls_router from app.api.websites_router import websites_router +from app.config import settings def create_api_app() -> FastAPI: @@ -18,7 +19,7 @@ def create_api_app() -> FastAPI: ) # Configure CORS for non-production modes - deployment_mode = os.environ.get("MODE", "production") + deployment_mode = settings.MODE if deployment_mode != "production": api_app.add_middleware( CORSMiddleware, @@ -27,6 +28,7 @@ def create_api_app() -> FastAPI: allow_methods=["*"], allow_headers=["*"], ) + # TODO: Configure CORS for production mode api_app.include_router(websites_router) api_app.include_router(crawls_router) diff --git a/app/api/utils.py b/app/api/utils.py index b3fddbb..bf5f438 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -1,17 +1,27 @@ +from urllib.parse import urlparse + from celery import group, chain import app.repositories as repositories from app.celery_broker.tasks import ( METADATA_TASK_REGISTRY, 
start_crawl_process, - upload_html, ) from app.models.crawl import CrawlModel +from app.models.enums import ProcessStatus from app.models.website import WebsiteModel from app.services.crawler_logger import logger def create_crawl(website: WebsiteModel) -> CrawlModel: + + # Check if the path component of the URL is empty or "/" + # If the crawl target is a single page, we will ignore the depth and the limit in the request. + if not is_domain(website.url): + website.depth = 0 + website.limit = 1 + logger.warning("The url to crawl is not a domain. Only one page will be crawled") + crawl: CrawlModel = CrawlModel( website_id=website.id, config=website.to_config(), @@ -32,5 +42,10 @@ def start_crawl(crawl: CrawlModel) -> None: chain( start_crawl_process.s(crawl), metadata_tasks, - upload_html.si(crawl), ).apply_async(task_id=crawl.id) + + +def is_domain(url: str) -> bool: + parsed_url = urlparse(url) + return parsed_url.path == '' or parsed_url.path == '/' + diff --git a/app/celery_broker/crawler_utils.py b/app/celery_broker/crawler_utils.py index c9e0f8c..9762cc1 100644 --- a/app/celery_broker/crawler_utils.py +++ b/app/celery_broker/crawler_utils.py @@ -32,3 +32,14 @@ def start_crawler_process(crawl_process: CrawlProcess, results: dict): process.crawl(MenesrSpider, crawl_process=crawl_process) process.start() results["metadata"] = dict(crawl_process.metadata.items()) + + +def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessStatus): + crawl.html_crawl.update( + task_id=request_id, status=status + ) + repositories.crawls.update_task( + crawl_id=crawl.id, + task_name="html_crawl", + task=crawl.html_crawl, + ) diff --git a/app/celery_broker/factory.py b/app/celery_broker/factory.py index 4c444a1..df2ccbd 100644 --- a/app/celery_broker/factory.py +++ b/app/celery_broker/factory.py @@ -2,14 +2,14 @@ from celery import Celery -from app.celery_broker.config import settings +from app.config import settings def create_celery_app() -> Celery: celery_app = Celery( "scanr", - broker=os.environ.get("CELERY_BROKER_URL"), - backend=os.environ.get("CELERY_RESULT_BACKEND"), + broker=settings.CELERY_BROKER_URL, + backend=settings.CELERY_RESULT_BACKEND, broker_connection_retry_on_startup=True, include=["app.celery_broker.tasks"], ) diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py index fd4f699..57d8b8b 100644 --- a/app/celery_broker/metadata_utils.py +++ b/app/celery_broker/metadata_utils.py @@ -6,10 +6,7 @@ from app.models.enums import MetadataType, ProcessStatus from app.models.metadata import MetadataTask from app.models.process import CrawlProcess -from app.services.accessibility_best_practices_calculator import ( - AccessibilityError, - BestPracticesError, -) +from app.services.lighthouse_calculator import LighthouseError from app.services.carbon_calculator import CarbonCalculatorError from app.services.crawler_logger import logger from app.services.responsiveness_calculator import ResponsivenessCalculatorError @@ -46,14 +43,12 @@ def handle_metadata_result( def store_metadata_result( crawl_process: CrawlProcess, result: dict, metadata_type: MetadataType ): - base_file_path = ( - f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}/{crawl_process.id}" - ) - file_path = pathlib.Path( - f"{base_file_path}/{os.environ['METADATA_FOLDER_NAME'].strip('/')}/{metadata_type}.json" + return repositories.files.store_metadata_file( + crawl_id=crawl_process.id, + object_name=f"{metadata_type}.json", + content_type='application/json', + data=json.dumps(result, 
indent=2, default=str) ) - file_path.parent.mkdir(exist_ok=True, parents=True) - file_path.write_text(json.dumps(result, indent=4)) def metadata_task( @@ -78,8 +73,7 @@ def metadata_task( data = calc_method(url) result[url] = data except ( - AccessibilityError, - BestPracticesError, + LighthouseError, TechnologiesError, ResponsivenessCalculatorError, CarbonCalculatorError, diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py index a9ea05a..a7bc3f4 100644 --- a/app/celery_broker/tasks.py +++ b/app/celery_broker/tasks.py @@ -4,21 +4,19 @@ import shutil from multiprocessing import Process, Manager -# Third-party imports -from minio import Minio - # Local imports import app.repositories as repositories -from app.celery_broker.crawler_utils import start_crawler_process +from app.celery_broker.crawler_utils import start_crawler_process, set_html_crawl_status from app.celery_broker.main import celery_app from app.celery_broker.metadata_utils import metadata_task from app.celery_broker.utils import assume_content_type +from app.config import settings from app.models.crawl import CrawlModel from app.models.enums import MetadataType, ProcessStatus from app.models.metadata import MetadataTask from app.models.process import CrawlProcess -from app.services.accessibility_best_practices_calculator import ( - LighthouseWrapper, +from app.services.lighthouse_calculator import ( + LighthouseCalculator, ) from app.services.carbon_calculator import CarbonCalculator from app.services.crawler_logger import logger @@ -36,44 +34,49 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess: crawl_id=crawl.id, status=ProcessStatus.STARTED ) logger.debug("Html crawl started!") - crawl.html_crawl.update( - task_id=self.request.id, status=ProcessStatus.STARTED - ) - repositories.crawls.update_task( - crawl_id=crawl.id, - task_name="html_crawl", - task=crawl.html_crawl, - ) + set_html_crawl_status(crawl, self.request.id, ProcessStatus.STARTED) crawl_process = CrawlProcess.from_model(crawl) - with Manager() as manager: - shared_dict = manager.dict() - p = Process( - target=start_crawler_process, - kwargs={"crawl_process": crawl_process, "results": shared_dict}, - ) - p.start() - p.join() # TODO define and add a timeout - crawl_process.metadata.update(shared_dict["metadata"]) - - crawl.html_crawl.update(status=ProcessStatus.SUCCESS) - repositories.crawls.update_task( - crawl_id=crawl.id, - task_name="html_crawl", - task=crawl.html_crawl, - ) + + try: + with Manager() as manager: + shared_dict = manager.dict() + p = Process( + target=start_crawler_process, + kwargs={"crawl_process": crawl_process, "results": shared_dict}, + ) + p.start() + p.join() # TODO define and add a timeout + crawl_process.metadata.update(shared_dict["metadata"]) + except Exception as e: + logger.error(f"Error while crawling html files: {e}") + set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR) + self.update_state(state='FAILURE') + return crawl_process + try: + # Attempt to upload HTML files associated with the crawl + upload_html(crawl) + except Exception as e: + logger.error(f"Error while uploading html files: {e}") + # Html crawl will be considered failed if we can't upload the html files + set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR) + self.update_state(state='FAILURE') + return crawl_process + + set_html_crawl_status(crawl, self.request.id, ProcessStatus.SUCCESS) + logger.debug("Html crawl ended!") return crawl_process -@celery_app.task(bind=True, name="get_accessibility") -def 
get_accessibility(self, crawl_process: CrawlProcess): +@celery_app.task(bind=True, name="get_lighthouse") +def get_lighthouse(self, crawl_process: CrawlProcess): return metadata_task( task=MetadataTask(task_id=self.request.id), crawl_process=crawl_process, - metadata_type=MetadataType.ACCESSIBILITY, - calculator=LighthouseWrapper(), - method_name="get_accessibility", + metadata_type=MetadataType.LIGHTHOUSE, + calculator=LighthouseCalculator(), + method_name="get_lighthouse", ) @@ -88,17 +91,6 @@ def get_technologies(self, crawl_process: CrawlProcess): ) -@celery_app.task(bind=True, name="get_good_practices") -def get_good_practices(self, crawl_process: CrawlProcess): - return metadata_task( - task=MetadataTask(task_id=self.request.id), - crawl_process=crawl_process, - metadata_type=MetadataType.GOOD_PRACTICES, - calculator=LighthouseWrapper(), - method_name="get_best_practices", - ) - - @celery_app.task(bind=True, name="get_responsiveness") def get_responsiveness(self, crawl_process: CrawlProcess): return metadata_task( @@ -121,69 +113,27 @@ def get_carbon_footprint(self, crawl_process: CrawlProcess): ) -@celery_app.task(bind=True, name="upload_html") -def upload_html(self, crawl: CrawlModel): - crawl.uploads.update(task_id=self.request.id, status=ProcessStatus.STARTED) - logger.debug("Files upload started!") - repositories.crawls.update_task( - crawl_id=crawl.id, - task_name="uploads", - task=crawl.uploads, - ) - - client = Minio( - endpoint=os.environ["STORAGE_SERVICE_URL"], - access_key=os.environ["STORAGE_SERVICE_USERNAME"], - secret_key=os.environ["STORAGE_SERVICE_PASSWORD"], - secure=os.environ.get("STORAGE_SERVICE_SECURE", False), - region=os.environ.get("STORAGE_SERVICE_REGION", None), - ) +METADATA_TASK_REGISTRY = { + MetadataType.LIGHTHOUSE: get_lighthouse, + MetadataType.TECHNOLOGIES: get_technologies, + MetadataType.RESPONSIVENESS: get_responsiveness, + MetadataType.CARBON_FOOTPRINT: get_carbon_footprint, +} - bucket_name = os.environ["STORAGE_SERVICE_BUCKET_NAME"] - if not client.bucket_exists(bucket_name): - client.make_bucket(bucket_name) +def upload_html(crawl: CrawlModel): crawl_files_path = pathlib.Path( - f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}/{crawl.id}" + f"/{settings.LOCAL_FILES_PATH.strip('/')}/{crawl.id}" ) - local_files_folder = f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}" - - prefix = crawl.config.url.replace("https://", "").replace("http://", "") + local_files_folder = f"/{settings.LOCAL_FILES_PATH.strip('/')}" for file in crawl_files_path.rglob("*.[hj][ts][mo][ln]"): file_path = str(file) - client.fput_object( - bucket_name=bucket_name, - object_name=f"{prefix.rstrip('/')}/{file_path.removeprefix(local_files_folder).lstrip('/')}", + file_name = file_path.removeprefix(local_files_folder).lstrip('/') + repositories.files.store_html_file( + object_name=file_name, file_path=file_path, content_type=assume_content_type(file_path), ) os.remove(file) shutil.rmtree(crawl_files_path, ignore_errors=True) - crawl.uploads.update(status=ProcessStatus.SUCCESS) - repositories.crawls.update_task( - crawl_id=crawl.id, - task_name="uploads", - task=crawl.uploads, - ) - logger.debug("Files upload ended!") - repositories.crawls.update_status( - crawl_id=crawl.id, status=ProcessStatus.SUCCESS - ) - logger.info( - f"Crawl process ({crawl.id}) for website {crawl.config.url} ended" - ) - - repositories.websites.store_last_crawl( - website_id=crawl.website_id, - crawl=repositories.crawls.get(crawl_id=crawl.id).model_dump(), - ) - - -METADATA_TASK_REGISTRY = { - 
MetadataType.ACCESSIBILITY: get_accessibility, - MetadataType.TECHNOLOGIES: get_technologies, - MetadataType.GOOD_PRACTICES: get_good_practices, - MetadataType.RESPONSIVENESS: get_responsiveness, - MetadataType.CARBON_FOOTPRINT: get_carbon_footprint, -} diff --git a/app/celery_broker/config.py b/app/config.py similarity index 50% rename from app/celery_broker/config.py rename to app/config.py index 7c96683..f65bc56 100644 --- a/app/celery_broker/config.py +++ b/app/config.py @@ -1,19 +1,49 @@ import os -from functools import lru_cache from kombu import Queue, Exchange class BaseConfig: + """Base configuration.""" + LOGGER_LEVEL = "INFO" + LOGGER_FORMAT = "[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s" + + + DEFAULT_RECRAWL_INTERVAL=os.getenv("DEFAULT_RECRAWL_INTERVAL", 30) + + MODE = os.getenv("MODE", "production") + + # GOOGLE_API_KEY + GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") + + # Volume + LOCAL_FILES_PATH = os.getenv("LOCAL_FILES_PATH", "/mounted/local_files/") + + # Storage + STORAGE_SERVICE_USERNAME = os.getenv("STORAGE_SERVICE_USERNAME") + STORAGE_SERVICE_PASSWORD = os.getenv("STORAGE_SERVICE_PASSWORD") + STORAGE_SERVICE_URL = os.getenv("STORAGE_SERVICE_URL") + STORAGE_SERVICE_REGION = os.getenv("STORAGE_SERVICE_REGION", default=None) + STORAGE_SERVICE_SECURE = os.getenv("STORAGE_SERVICE_SECURE", default='False').lower() in ('true', '1', 't') + STORAGE_SERVICE_BUCKET_NAME = os.getenv("STORAGE_SERVICE_BUCKET_NAME") + HTML_FOLDER_NAME = os.getenv("HTML_FOLDER_NAME", default="html") + METADATA_FOLDER_NAME = os.getenv("METADATA_FOLDER_NAME", default="metadata") + + # Mongo + MONGO_URI = os.getenv("MONGO_URI", default="mongodb://mongodb:27017") + MONGO_DBNAME = os.getenv("MONGO_DBNAME", default="open-crawler") + MONGO_WEBSITES_COLLECTION = os.getenv("MONGO_WEBSITES_COLLECTION", default="websites") + MONGO_CRAWLS_COLLECTION = os.getenv("MONGO_CRAWLS_COLLECTION", default="crawls") + + # Celery + CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", default="redis://redis:6379") + CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", default="redis://redis:6379") + CRAWL_QUEUE_NAME = "crawl_queue" - ACCESSIBILITY_QUEUE_NAME = "accessibility_queue" + LIGHTHOUSE_QUEUE_NAME = "lighthouse_queue" TECHNOLOGIES_QUEUE_NAME = "technologies_queue" - GOOD_PRACTICES_QUEUE_NAME = "good_practices_queue" RESPONSIVENESS_QUEUE_NAME = "responsiveness_queue" CARBON_QUEUE_NAME = "carbon_footprint_queue" - UPLOAD_QUEUE_NAME = "upload_queue" - - result_backend: str = "redis://redis:6379" # The following two lines make celery execute tasks locally # task_always_eager = True @@ -29,20 +59,15 @@ class BaseConfig: routing_key=CRAWL_QUEUE_NAME, ), Queue( - ACCESSIBILITY_QUEUE_NAME, - Exchange(ACCESSIBILITY_QUEUE_NAME), - routing_key=ACCESSIBILITY_QUEUE_NAME, + LIGHTHOUSE_QUEUE_NAME, + Exchange(LIGHTHOUSE_QUEUE_NAME), + routing_key=LIGHTHOUSE_QUEUE_NAME, ), Queue( TECHNOLOGIES_QUEUE_NAME, Exchange(TECHNOLOGIES_QUEUE_NAME), routing_key=TECHNOLOGIES_QUEUE_NAME, ), - Queue( - GOOD_PRACTICES_QUEUE_NAME, - Exchange(GOOD_PRACTICES_QUEUE_NAME), - routing_key=GOOD_PRACTICES_QUEUE_NAME, - ), Queue( RESPONSIVENESS_QUEUE_NAME, Exchange(RESPONSIVENESS_QUEUE_NAME), @@ -53,27 +78,18 @@ class BaseConfig: Exchange(CARBON_QUEUE_NAME), routing_key=CARBON_QUEUE_NAME, ), - Queue( - UPLOAD_QUEUE_NAME, - Exchange(UPLOAD_QUEUE_NAME), - routing_key=UPLOAD_QUEUE_NAME, - ), ) task_routes = { "crawl": {"queue": CRAWL_QUEUE_NAME, "routing_key": CRAWL_QUEUE_NAME}, - "get_accessibility": { - "queue": 
ACCESSIBILITY_QUEUE_NAME, - "routing_key": ACCESSIBILITY_QUEUE_NAME, + "get_lighthouse": { + "queue": LIGHTHOUSE_QUEUE_NAME, + "routing_key": LIGHTHOUSE_QUEUE_NAME, }, "get_technologies": { "queue": TECHNOLOGIES_QUEUE_NAME, "routing_key": TECHNOLOGIES_QUEUE_NAME, }, - "get_good_practices": { - "queue": GOOD_PRACTICES_QUEUE_NAME, - "routing_key": GOOD_PRACTICES_QUEUE_NAME, - }, "get_responsiveness": { "queue": RESPONSIVENESS_QUEUE_NAME, "routing_key": RESPONSIVENESS_QUEUE_NAME, @@ -81,11 +97,7 @@ class BaseConfig: "get_carbon_footprint": { "queue": CARBON_QUEUE_NAME, "routing_key": CARBON_QUEUE_NAME, - }, - "upload_html": { - "queue": UPLOAD_QUEUE_NAME, - "routing_key": UPLOAD_QUEUE_NAME, - }, + } } def get(self, attribute_name: str): @@ -96,13 +108,12 @@ class DevelopmentConfig(BaseConfig): pass -@lru_cache() def get_settings(): config_cls_dict = { "development": DevelopmentConfig, "default": BaseConfig } - config_name = os.environ.get("CELERY_CONFIG", "default") + config_name = os.environ.get("CONFIG_PROFILE", "default") config_cls = config_cls_dict[config_name] return config_cls() diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py index f1e6ec1..8f1020c 100644 --- a/app/crawler/middlewares.py +++ b/app/crawler/middlewares.py @@ -5,6 +5,8 @@ import os from pathlib import Path +from app.config import settings + from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware from scrapy.exceptions import IgnoreRequest from scrapy.extensions.closespider import CloseSpider @@ -46,14 +48,14 @@ def from_crawler(cls, crawler): def _format_file_path(self, response, spider) -> Path: domain = spider.allowed_domains[0] - base_file_path = f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}/{spider.crawl_process.id}" + base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}" file_name = response.url.split(f"{domain}")[-1] if not file_name.endswith(".html"): file_name = f"{file_name}.html" if file_name == ".html": file_name = "index.html" return Path( - f"{base_file_path}/{os.environ['HTML_FOLDER_NAME'].strip('/')}/{file_name.lstrip('/')}" + f"{base_file_path}/{settings.HTML_FOLDER_NAME.strip('/')}/{file_name.lstrip('/')}" ) def _save_html_locally(self, response, spider): diff --git a/app/models/crawl.py b/app/models/crawl.py index 362e400..8dcac6d 100644 --- a/app/models/crawl.py +++ b/app/models/crawl.py @@ -5,7 +5,7 @@ from app.celery_broker.utils import french_datetime from app.models.enums import MetadataType, ProcessStatus -from app.models.metadata import MetadataConfig, AccessibilityModel, MetadataTask +from app.models.metadata import MetadataConfig, LighthouseModel, MetadataTask from app.models.utils import get_uuid, BaseTaskModel @@ -31,12 +31,10 @@ class CrawlModel(BaseModel): finished_at: datetime | None = None status: ProcessStatus = ProcessStatus.PENDING html_crawl: BaseTaskModel = Field(default_factory=BaseTaskModel) - accessibility: AccessibilityModel | None = None + lighthouse: LighthouseModel | None = None technologies_and_trackers: MetadataTask | None = None responsiveness: MetadataTask | None = None - good_practices: MetadataTask | None = None carbon_footprint: MetadataTask | None = None - uploads: BaseTaskModel = Field(default_factory=BaseTaskModel) @property def enabled_metadata(self) -> list[MetadataType]: @@ -47,14 +45,12 @@ def enabled_metadata(self) -> list[MetadataType]: ] def init_tasks(self) -> None: - if MetadataType.ACCESSIBILITY in self.enabled_metadata: - self.accessibility = AccessibilityModel() + if 
MetadataType.LIGHTHOUSE in self.enabled_metadata: + self.lighthouse = LighthouseModel() if MetadataType.TECHNOLOGIES in self.enabled_metadata: self.technologies_and_trackers = MetadataTask() if MetadataType.RESPONSIVENESS in self.enabled_metadata: self.responsiveness = MetadataTask() - if MetadataType.GOOD_PRACTICES in self.enabled_metadata: - self.good_practices = MetadataTask() if MetadataType.CARBON_FOOTPRINT in self.enabled_metadata: self.carbon_footprint = MetadataTask() diff --git a/app/models/enums.py b/app/models/enums.py index 3eb22c9..0b25960 100644 --- a/app/models/enums.py +++ b/app/models/enums.py @@ -2,10 +2,9 @@ class MetadataType(StrEnum): - ACCESSIBILITY = "accessibility" + LIGHTHOUSE = "lighthouse" TECHNOLOGIES = "technologies_and_trackers" RESPONSIVENESS = "responsiveness" - GOOD_PRACTICES = "good_practices" CARBON_FOOTPRINT = "carbon_footprint" diff --git a/app/models/metadata.py b/app/models/metadata.py index 3e6d3d7..89c9ff9 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -14,5 +14,5 @@ class MetadataTask(BaseTaskModel): pass -class AccessibilityModel(MetadataTask): +class LighthouseModel(MetadataTask): score: float | None = None diff --git a/app/models/request.py b/app/models/request.py index 15cd119..718ba4d 100644 --- a/app/models/request.py +++ b/app/models/request.py @@ -11,10 +11,9 @@ class UpdateWebsiteRequest(BaseModel): depth: int | None = None limit: int | None = None - accessibility: MetadataConfig | None = None + lighthouse: MetadataConfig | None = None technologies_and_trackers: MetadataConfig | None = None responsiveness: MetadataConfig | None = None - good_practices: MetadataConfig | None = None carbon_footprint: MetadataConfig | None = None headers: dict[str, Any] | None = None tags: list[str] | None = None @@ -26,16 +25,13 @@ class CreateWebsiteRequest(BaseModel): url: str depth: int = Field(ge=0, default=2) limit: int = Field(ge=0, default=400) - accessibility: MetadataConfig = Field(default=MetadataConfig()) + lighthouse: MetadataConfig = Field(default=MetadataConfig()) technologies_and_trackers: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) responsiveness: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) - good_practices: MetadataConfig = Field( - default=MetadataConfig(enabled=False) - ) carbon_footprint: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) diff --git a/app/models/website.py b/app/models/website.py index 943d2fd..c024257 100644 --- a/app/models/website.py +++ b/app/models/website.py @@ -10,7 +10,6 @@ from app.models.metadata import MetadataConfig from app.models.utils import get_uuid -DEFAULT_RECRAWL_INTERVAL = os.environ.get("DEFAULT_RECRAWL_INTERVAL", 30) class WebsiteModel(BaseModel): @@ -18,10 +17,9 @@ class WebsiteModel(BaseModel): url: str depth: int limit: int - accessibility: MetadataConfig + lighthouse: MetadataConfig technologies_and_trackers: MetadataConfig responsiveness: MetadataConfig - good_practices: MetadataConfig carbon_footprint: MetadataConfig headers: dict[str, Any] created_at: datetime = Field(default_factory=french_datetime) @@ -36,10 +34,9 @@ def to_config(self) -> CrawlConfig: url=self.url, parameters=CrawlParameters(depth=self.depth, limit=self.limit), metadata_config={ - MetadataType.ACCESSIBILITY: self.accessibility, + MetadataType.LIGHTHOUSE: self.lighthouse, MetadataType.TECHNOLOGIES: self.technologies_and_trackers, MetadataType.RESPONSIVENESS: self.responsiveness, - MetadataType.GOOD_PRACTICES: self.good_practices, 
MetadataType.CARBON_FOOTPRINT: self.carbon_footprint, }, headers=self.headers, diff --git a/app/mongo.py b/app/mongo.py index 814f9f1..1a77f8f 100644 --- a/app/mongo.py +++ b/app/mongo.py @@ -1,14 +1,16 @@ import os +from app.config import settings from pymongo import MongoClient -client = MongoClient(host=os.environ["MONGO_URI"]) -db = client[os.environ["MONGO_DBNAME"]] -db[os.environ["MONGO_WEBSITES_COLLECTION"]].create_index( +client = MongoClient(host=settings.MONGO_URI) +db = client[settings.MONGO_DBNAME] + +db[settings.MONGO_WEBSITES_COLLECTION].create_index( [("id", 1)], unique=True ) -db[os.environ["MONGO_WEBSITES_COLLECTION"]].create_index( +db[settings.MONGO_WEBSITES_COLLECTION].create_index( [("url", 1)], unique=True ) -db[os.environ["MONGO_CRAWLS_COLLECTION"]].create_index([("id", 1)], unique=True) +db[settings.MONGO_CRAWLS_COLLECTION].create_index([("id", 1)], unique=True) diff --git a/app/repositories/__init__.py b/app/repositories/__init__.py index 36733b7..5e0f6b1 100644 --- a/app/repositories/__init__.py +++ b/app/repositories/__init__.py @@ -1,4 +1,5 @@ from .crawls import crawls from .websites import websites +from .files import files -__all__ = ["crawls", "websites"] +__all__ = ["crawls", "websites", "files"] diff --git a/app/repositories/crawls.py b/app/repositories/crawls.py index d2d5bb4..905bb35 100644 --- a/app/repositories/crawls.py +++ b/app/repositories/crawls.py @@ -1,8 +1,8 @@ import os - from pymongo.results import InsertOneResult from app.celery_broker.utils import french_datetime +from app.config import settings from app.models.crawl import CrawlModel, ListCrawlResponse from app.models.enums import ProcessStatus from app.models.metadata import MetadataTask @@ -13,7 +13,7 @@ class CrawlsRepository: """Operations for crawls collection""" def __init__(self): - self.collection = db[os.environ["MONGO_CRAWLS_COLLECTION"]] + self.collection = db[settings.MONGO_CRAWLS_COLLECTION] def create(self, data: CrawlModel) -> str: result: InsertOneResult = self.collection.insert_one(data.model_dump()) diff --git a/app/repositories/files.py b/app/repositories/files.py new file mode 100644 index 0000000..23f72ce --- /dev/null +++ b/app/repositories/files.py @@ -0,0 +1,48 @@ +import io +from zipfile import ZipFile, ZIP_DEFLATED +from app.s3 import with_s3 + + +class FileRepository: + """Operations for crawls collection""" + + @staticmethod + @with_s3 + def delete_all_crawl_files(s3, bucket, crawl_id): + """Delete all crawl's files from the storage service""" + objects = s3.list_objects(bucket, prefix=crawl_id, recursive=True) + for obj in objects: + s3.remove_object(bucket, obj.object_name) + return + + @staticmethod + @with_s3 + def store_html_file(s3, bucket, object_name, file_path, content_type): + """Store a crawl file in the storage service""" + return s3.fput_object(bucket, object_name=object_name, file_path=file_path, content_type=content_type) + + @staticmethod + @with_s3 + def store_metadata_file(s3, bucket, crawl_id, object_name, content_type, data): + """Store a crawl file in the storage service""" + object_path = f"{crawl_id}/metadata/{object_name}" + # Convert the string to bytes + data_bytes = data.encode('utf-8') + # Create a BytesIO object to make the bytes readable + data_stream = io.BytesIO(data_bytes) + return s3.put_object(bucket, object_name=object_path, length=len(data_bytes), content_type=content_type, data=data_stream) + + @staticmethod + @with_s3 + def zip_all_crawl_files(s3, bucket, crawl_id) -> ZipFile: + """Zip all crawl's files from the storage 
service""" + zip_io = io.BytesIO() + objects = s3.list_objects(bucket, prefix=crawl_id, recursive=True) + with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper: + for obj in objects: + file = s3.get_object(bucket, obj.object_name).read() + zipper.writestr(obj.object_name.strip(crawl_id), file) + return zip_io + + +files = FileRepository() diff --git a/app/repositories/websites.py b/app/repositories/websites.py index f8f0565..59f91d3 100644 --- a/app/repositories/websites.py +++ b/app/repositories/websites.py @@ -4,6 +4,7 @@ from pymongo.results import InsertOneResult, UpdateResult from app.celery_broker.utils import french_datetime +from app.config import settings from app.models.enums import ProcessStatus from app.models.request import UpdateWebsiteRequest from app.models.website import WebsiteModel, ListWebsiteResponse @@ -14,7 +15,7 @@ class WebsitesRepository: """Operations for websites collection""" def __init__(self): - self.collection = db[os.environ["MONGO_WEBSITES_COLLECTION"]] + self.collection = db[settings.MONGO_WEBSITES_COLLECTION] def list( self, diff --git a/app/s3.py b/app/s3.py new file mode 100644 index 0000000..e16d51d --- /dev/null +++ b/app/s3.py @@ -0,0 +1,29 @@ +import os +from functools import wraps +from minio import Minio + +from app.config import settings + +s3 = Minio( + endpoint=settings.STORAGE_SERVICE_URL, + access_key=settings.STORAGE_SERVICE_USERNAME, + secret_key=settings.STORAGE_SERVICE_PASSWORD, + secure=settings.STORAGE_SERVICE_SECURE, + region=settings.STORAGE_SERVICE_REGION, +) + +bucket = settings.STORAGE_SERVICE_BUCKET_NAME + +if not s3.bucket_exists(bucket): + s3.make_bucket(bucket) + + +def with_s3(f): + """Decorate a function for s3 connexion.""" + @wraps(f) + def wrapper(*args, **kwargs): + print(f"Calling {f.__name__} with s3 connexion", flush=True) + print(f"args: {','.join(map(str,args))}", flush=True) + response = f(s3, bucket, *args, **kwargs) + return response + return wrapper diff --git a/app/services/accessibility_best_practices_calculator.py b/app/services/accessibility_best_practices_calculator.py deleted file mode 100644 index 6676da3..0000000 --- a/app/services/accessibility_best_practices_calculator.py +++ /dev/null @@ -1,67 +0,0 @@ -import json -import subprocess -from enum import StrEnum -from typing import Any - - -class LighthouseCategories(StrEnum): - ACCESSIBILITY = "accessibility" - BEST_PRACTICES = "best-practices" - - -class LighthouseError(Exception): - pass - - -class AccessibilityError(Exception): - pass - - -class BestPracticesError(Exception): - pass - - -class LighthouseWrapper: - def get_accessibility(self, url: str) -> dict[str, Any]: - try: - result = self.get_categories( - url=url, categories=[LighthouseCategories.ACCESSIBILITY] - ) - except LighthouseError as e: - raise AccessibilityError from e - return result["accessibility"] - - def get_best_practices(self, url: str) -> dict[str, Any]: - try: - result = self.get_categories( - url=url, categories=[LighthouseCategories.BEST_PRACTICES] - ) - except LighthouseError as e: - raise BestPracticesError from e - return result["best-practices"] - - def get_categories( - self, url: str, categories: list[LighthouseCategories] - ) -> dict[str, Any]: - try: - lighthouse_process = subprocess.run( - " ".join( - [ - "lighthouse", - url, - '--chrome-flags="--no-sandbox --headless --disable-dev-shm-usage"', - f"--only-categories={','.join(categories)}", - "--output=json", - "--disable-full-page-screenshot", - "--no-enable-error-reporting", - "--quiet", - ] - ), - 
stdout=subprocess.PIPE, - shell=True, - ) - lighthouse_response = json.loads(lighthouse_process.stdout) - result = lighthouse_response["categories"] - except Exception as e: - raise LighthouseError from e - return result diff --git a/app/services/lighthouse_calculator.py b/app/services/lighthouse_calculator.py new file mode 100644 index 0000000..4365a67 --- /dev/null +++ b/app/services/lighthouse_calculator.py @@ -0,0 +1,32 @@ +import json +import subprocess +from typing import Any + + +class LighthouseError(Exception): + pass + + +class LighthouseCalculator: + def get_lighthouse(self, url: str) -> dict[str, Any]: + try: + lighthouse_process = subprocess.run( + " ".join( + [ + "lighthouse", + url, + '--chrome-flags="--no-sandbox --headless --disable-dev-shm-usage"', + "--output=json", + "--disable-full-page-screenshot", + "--no-enable-error-reporting", + "--quiet", + ] + ), + stdout=subprocess.PIPE, + shell=True, + ) + lighthouse_response = json.loads(lighthouse_process.stdout) + result = lighthouse_response + except Exception as e: + raise LighthouseError from e + return result diff --git a/app/services/responsiveness_calculator.py b/app/services/responsiveness_calculator.py index a773dfe..4fcb744 100644 --- a/app/services/responsiveness_calculator.py +++ b/app/services/responsiveness_calculator.py @@ -3,6 +3,8 @@ import requests +from app.config import settings + class ResponsivenessCalculatorError(Exception): pass @@ -11,7 +13,7 @@ class ResponsivenessCalculatorError(Exception): class ResponsivenessCalculator: def __init__(self): self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run" - self._api_key = os.environ["GOOGLE_API_KEY"] + self._api_key = settings.GOOGLE_API_KEY def get_responsiveness(self, url: str) -> dict[str, Any]: response = None diff --git a/client/index.html b/client/index.html index 53b3037..5d28e73 100644 --- a/client/index.html +++ b/client/index.html @@ -3,14 +3,21 @@
- + -
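A note on the storage refactor introduced above: `app/s3.py` builds a single MinIO client and exposes a `with_s3` decorator, and the `FileRepository` methods declare `s3` and `bucket` as their first parameters so the decorator can inject them. Callers therefore pass only the remaining arguments, e.g. `repositories.files.store_html_file(object_name=..., file_path=..., content_type=...)`. The following self-contained sketch (not repository code) illustrates the injection pattern with a stand-in client so it runs without a storage service.

```python
# Illustrative sketch of the with_s3 injection pattern from app/s3.py (not repo code).
# A fake client stands in for the configured Minio instance.
import io
from functools import wraps


class FakeS3:
    def put_object(self, bucket, object_name, data, length, content_type):
        print(f"PUT {bucket}/{object_name} ({length} bytes, {content_type})")


s3 = FakeS3()            # in app/s3.py this is a Minio(...) client built from settings
bucket = "open-crawler"  # STORAGE_SERVICE_BUCKET_NAME


def with_s3(f):
    """Prepend the shared client and bucket to the wrapped function's arguments."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        return f(s3, bucket, *args, **kwargs)
    return wrapper


@with_s3
def store_bytes(s3, bucket, object_name, payload: bytes):
    # Callers never supply s3/bucket themselves; the decorator injects them.
    s3.put_object(bucket, object_name, data=io.BytesIO(payload),
                  length=len(payload), content_type="application/octet-stream")


store_bytes("example-crawl-id/metadata/demo.json", b"{}")  # hypothetical object name
```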