Merge pull request #28 from dataesr/dev
Dev
folland87 authored Nov 16, 2023
2 parents e133ab4 + 308e63a commit 1e0f2fd
Showing 54 changed files with 579 additions and 536 deletions.
14 changes: 7 additions & 7 deletions .env
@@ -1,14 +1,15 @@
MODE=dev
#Volumes Config
CONFIG_PROFILE=development

# Volumes Config
LOCAL_FILES_PATH=/mounted/local_files/ # TODO: Change value before build
MINIO_PATH=/mounted/minio/ # TODO: Change value before build
MONGODB_PATH=/mounted/mongodb/ # TODO: Change value before build

#Storage Service Config

#API_KEYS
GOOGLE_API_KEY=CHANGEME
# API_KEYS
GOOGLE_API_KEY=AIzaSyAfdZFZM1mz7IYUgCpESSJX4zdJZ589eX0

# Scrapy
SCRAPY_SETTINGS_MODULE=app.crawler.settings
C_FORCE_ROOT=True

@@ -24,10 +25,9 @@ STORAGE_SERVICE_USERNAME=admin
STORAGE_SERVICE_PASSWORD=password123
STORAGE_SERVICE_URL=minio:9000
STORAGE_SERVICE_REGION=gra
STORAGE_SERVICE_SECURE=false
STORAGE_SERVICE_BUCKET_NAME=open-crawler
HTML_FOLDER_NAME=html # TODO: Change value before build
METADATA_FOLDER_NAME=metadata # TODO: Change value before build

LOGGER_LEVEL=INFO

DEFAULT_RECRAWL_INTERVAL=30
3 changes: 3 additions & 0 deletions .github/workflows/staging.yml
@@ -20,6 +20,9 @@ jobs:
    steps:
      - uses: actions/checkout@v3

      - name : Install Packages
        run : pip install -r requirements.txt

      - name: test
        run: python -m unittest

22 changes: 22 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,22 @@
name: Testing deployment

on:
  push:

jobs:
  unit-test:
    name: run unit tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.11
        uses: actions/setup-python@v1
        with:
          python-version: 3.11

      - name : Install Packages
        run : pip install -r requirements.txt

      - name: test
        run: python -m unittest
7 changes: 4 additions & 3 deletions README.md
@@ -89,8 +89,7 @@ This endpoint allows you to create a new website configuration and execute a crawl
| `depth` | `integer` | Maximum depth to crawl (**Default**: 2) |
| `limit` | `integer` | Maximum pages to crawl (**Default**: 400) |
| `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {})|
| `accessibility` | `MetadataConfig` | Accessibility configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `good_practices` | `MetadataConfig` | Good Practices configuration (**Default**: {'enabled': False}) |
| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) |
| `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) |
| `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) |
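For context, the parameters in the table above could be posted to the crawl-creation endpoint roughly as follows. This is a hedged sketch: the host, port, and path (`/websites`) are assumptions, not taken from this diff; only the field names and defaults come from the table.

```python
# Hypothetical request sketch -- the base URL and the /websites path are assumptions.
import requests

payload = {
    "url": "https://www.example.com",             # site to crawl
    "depth": 2,                                   # default: 2
    "limit": 400,                                 # default: 400
    "headers": {},                                # extra headers for every crawl request
    "lighthouse": {"enabled": True, "depth": 0},
    "technologies": {"enabled": False},
    "responsiveness": {"enabled": False},
    "carbon_footprint": {"enabled": False},
}

response = requests.post("http://localhost:8080/websites", json=payload, timeout=30)
response.raise_for_status()
print(response.json())  # the created website configuration, including its id
```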
@@ -132,7 +131,9 @@ To access the two collections, use a MongoDB console (such as MongoDB Compass fo
**website_crawl_parameters** collection:
![mongodb_config](./demo/mongodb_crawl_configuration.png)

## Access simple storage service

At the end of the crawl process, all crawled html pages and metadata files are uploaded to a simple storage service (s3).
At the end of the crawl process, all crawled html pages are uploaded to a simple storage service (s3).
The metadata are directly uploaded to the storage service.

The docker-compose file deploys a MinIO service that can be accessed at http://localhost:9090 by default.

![minio](./demo/minio.png)
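For readers who prefer a programmatic check over the console, here is a minimal sketch that lists the bucket with the MinIO Python client, using the development defaults from the .env excerpt above; the `localhost:9000` port mapping is an assumption about the compose file.

```python
# Minimal sketch: list the objects stored by the crawler, using the development
# defaults shown in the .env diff above. Do not reuse these credentials in production.
from minio import Minio

client = Minio(
    "localhost:9000",          # STORAGE_SERVICE_URL is minio:9000 inside the compose network
    access_key="admin",        # STORAGE_SERVICE_USERNAME
    secret_key="password123",  # STORAGE_SERVICE_PASSWORD
    secure=False,              # STORAGE_SERVICE_SECURE=false
    region="gra",              # STORAGE_SERVICE_REGION
)

for obj in client.list_objects("open-crawler", recursive=True):
    print(obj.object_name, obj.size)
```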
42 changes: 13 additions & 29 deletions app/api/crawls_router.py
@@ -1,10 +1,5 @@
import io
import os
from zipfile import ZipFile, ZIP_DEFLATED

from fastapi import HTTPException, APIRouter, status as statuscode
from fastapi.responses import StreamingResponse
from minio import Minio

import app.repositories as repositories
from app.api.utils import create_crawl, start_crawl
@@ -58,34 +53,23 @@ def list_crawls(
    status_code=statuscode.HTTP_200_OK,
    summary="Get a zip of all files from a crawl",
)
def get_crawl_files(website_id: str, crawl_id: str) -> StreamingResponse:
def get_crawl_files(crawl_id: str) -> StreamingResponse:
    """Zip the files from the storage service"""
    client = Minio(
        endpoint=os.environ["STORAGE_SERVICE_URL"],
        access_key=os.environ["STORAGE_SERVICE_USERNAME"],
        secret_key=os.environ["STORAGE_SERVICE_PASSWORD"],
        secure=os.environ.get("STORAGE_SERVICE_SECURE", False),
        region=os.environ.get("STORAGE_SERVICE_REGION", None),
    )

    bucket = os.environ["STORAGE_SERVICE_BUCKET_NAME"]
    zip_io = io.BytesIO()
    if not (crawl := repositories.crawls.get(website_id, crawl_id)):
        raise HTTPException(
            status_code=statuscode.HTTP_404_NOT_FOUND,
            detail="Crawl not found",
        )
    url = crawl.config.url.replace("https://", "").replace("http://", "")
    prefix = f"{url}/{crawl_id}"
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper:
        for obj in objects:
            file = client.get_object(bucket, obj.object_name).read()
            zipper.writestr(obj.object_name, file)
    zip_io = repositories.files.zip_all_crawl_files(crawl_id)
    return StreamingResponse(
        iter([zip_io.getvalue()]),
        media_type="application/x-zip-compressed",
        headers={
            "Content-Disposition": f"attachment; filename={url}-{crawl_id}.zip"
            "Content-Disposition": f"attachment; filename={crawl_id}.zip"
        },
    )


@crawls_router.delete(
    "/{website_id}/crawls/{crawl_id}",
    status_code=statuscode.HTTP_204_NO_CONTENT,
    summary="Delete a crawl",
)
def delete_crawl(crawl_id: str) -> None:
    """Delete all crawl files from the storage service"""
    return repositories.files.delete_all_crawl_files(crawl_id)
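The new `app.repositories.files` module is not part of the loaded diff. The sketch below shows what `zip_all_crawl_files` plausibly does, reusing the MinIO calls removed above; the module layout and the crawl-id object prefix are assumptions.

```python
# Hypothetical sketch of app/repositories/files.py -- not the committed implementation.
import io
import os
from zipfile import ZipFile, ZIP_DEFLATED

from minio import Minio

client = Minio(
    endpoint=os.environ["STORAGE_SERVICE_URL"],
    access_key=os.environ["STORAGE_SERVICE_USERNAME"],
    secret_key=os.environ["STORAGE_SERVICE_PASSWORD"],
    secure=os.environ.get("STORAGE_SERVICE_SECURE", "false").lower() == "true",
    region=os.environ.get("STORAGE_SERVICE_REGION"),
)
BUCKET = os.environ["STORAGE_SERVICE_BUCKET_NAME"]


def zip_all_crawl_files(crawl_id: str) -> io.BytesIO:
    """Bundle every stored object belonging to a crawl into an in-memory zip."""
    zip_io = io.BytesIO()
    with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper:
        # Assumes objects are now keyed by crawl_id instead of the site URL.
        for obj in client.list_objects(BUCKET, prefix=crawl_id, recursive=True):
            data = client.get_object(BUCKET, obj.object_name).read()
            zipper.writestr(obj.object_name, data)
    return zip_io
```

Moving this logic behind a repository keeps the router free of storage details and lets `delete_all_crawl_files` share the same client.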
4 changes: 3 additions & 1 deletion app/api/factory.py
@@ -5,6 +5,7 @@

from app.api.crawls_router import crawls_router
from app.api.websites_router import websites_router
from app.config import settings


def create_api_app() -> FastAPI:
@@ -18,7 +19,7 @@ def create_api_app() -> FastAPI:
    )

    # Configure CORS for non-production modes
    deployment_mode = os.environ.get("MODE", "production")
    deployment_mode = settings.MODE
    if deployment_mode != "production":
        api_app.add_middleware(
            CORSMiddleware,
@@ -27,6 +28,7 @@
            allow_methods=["*"],
            allow_headers=["*"],
        )
    # TODO: Configure CORS for production mode

    api_app.include_router(websites_router)
    api_app.include_router(crawls_router)
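`app.config.settings` replaces direct `os.environ` lookups here and in the Celery factory further down. It is not shown in the loaded diff; here is a minimal sketch, assuming pydantic-settings and only the fields used in this commit (default values are placeholders).

```python
# Hypothetical sketch of app/config.py -- assumes pydantic-settings is used.
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    MODE: str = "production"
    CELERY_BROKER_URL: str = "redis://redis:6379/0"      # placeholder default
    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"  # placeholder default


settings = Settings()
```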
19 changes: 17 additions & 2 deletions app/api/utils.py
@@ -1,17 +1,27 @@
from urllib.parse import urlparse

from celery import group, chain

import app.repositories as repositories
from app.celery_broker.tasks import (
METADATA_TASK_REGISTRY,
start_crawl_process,
upload_html,
)
from app.models.crawl import CrawlModel
from app.models.enums import ProcessStatus
from app.models.website import WebsiteModel
from app.services.crawler_logger import logger


def create_crawl(website: WebsiteModel) -> CrawlModel:

    # Check if the path component of the URL is empty or "/"
    # If the crawl target is a single page, we will ignore the depth and the limit in the request.
    if not is_domain(website.url):
        website.depth = 0
        website.limit = 1
        logger.warning("The url to crawl is not a domain. Only one page will be crawled")

    crawl: CrawlModel = CrawlModel(
        website_id=website.id,
        config=website.to_config(),
@@ -32,5 +42,10 @@ def start_crawl(crawl: CrawlModel) -> None:
    chain(
        start_crawl_process.s(crawl),
        metadata_tasks,
        upload_html.si(crawl),
    ).apply_async(task_id=crawl.id)


def is_domain(url: str) -> bool:
    parsed_url = urlparse(url)
    return parsed_url.path == '' or parsed_url.path == '/'
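A quick, self-contained illustration of the new single-page rule (the URLs are hypothetical examples):

```python
# is_domain() treats a URL with an empty or "/" path as a whole-domain crawl target.
from urllib.parse import urlparse


def is_domain(url: str) -> bool:
    parsed_url = urlparse(url)
    return parsed_url.path in ("", "/")


print(is_domain("https://www.example.com"))        # True  -> depth and limit are kept
print(is_domain("https://www.example.com/"))       # True
print(is_domain("https://www.example.com/page1"))  # False -> depth forced to 0, limit to 1
```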

11 changes: 11 additions & 0 deletions app/celery_broker/crawler_utils.py
@@ -32,3 +32,14 @@ def start_crawler_process(crawl_process: CrawlProcess, results: dict):
    process.crawl(MenesrSpider, crawl_process=crawl_process)
    process.start()
    results["metadata"] = dict(crawl_process.metadata.items())


def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessStatus):
    crawl.html_crawl.update(
        task_id=request_id, status=status
    )
    repositories.crawls.update_task(
        crawl_id=crawl.id,
        task_name="html_crawl",
        task=crawl.html_crawl,
    )
6 changes: 3 additions & 3 deletions app/celery_broker/factory.py
@@ -2,14 +2,14 @@

from celery import Celery

from app.celery_broker.config import settings
from app.config import settings


def create_celery_app() -> Celery:
    celery_app = Celery(
        "scanr",
        broker=os.environ.get("CELERY_BROKER_URL"),
        backend=os.environ.get("CELERY_RESULT_BACKEND"),
        broker=settings.CELERY_BROKER_URL,
        backend=settings.CELERY_RESULT_BACKEND,
        broker_connection_retry_on_startup=True,
        include=["app.celery_broker.tasks"],
    )
20 changes: 7 additions & 13 deletions app/celery_broker/metadata_utils.py
@@ -6,10 +6,7 @@
from app.models.enums import MetadataType, ProcessStatus
from app.models.metadata import MetadataTask
from app.models.process import CrawlProcess
from app.services.accessibility_best_practices_calculator import (
    AccessibilityError,
    BestPracticesError,
)
from app.services.lighthouse_calculator import LighthouseError
from app.services.carbon_calculator import CarbonCalculatorError
from app.services.crawler_logger import logger
from app.services.responsiveness_calculator import ResponsivenessCalculatorError
@@ -46,14 +43,12 @@ def handle_metadata_result(
def store_metadata_result(
    crawl_process: CrawlProcess, result: dict, metadata_type: MetadataType
):
    base_file_path = (
        f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}/{crawl_process.id}"
    )
    file_path = pathlib.Path(
        f"{base_file_path}/{os.environ['METADATA_FOLDER_NAME'].strip('/')}/{metadata_type}.json"
    return repositories.files.store_metadata_file(
        crawl_id=crawl_process.id,
        object_name=f"{metadata_type}.json",
        content_type='application/json',
        data=json.dumps(result, indent=2, default=str)
    )
    file_path.parent.mkdir(exist_ok=True, parents=True)
    file_path.write_text(json.dumps(result, indent=4))


def metadata_task(
@@ -78,8 +73,7 @@ def metadata_task(
            data = calc_method(url)
            result[url] = data
        except (
            AccessibilityError,
            BestPracticesError,
            LighthouseError,
            TechnologiesError,
            ResponsivenessCalculatorError,
            CarbonCalculatorError,
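Like the zip helper sketched earlier, `repositories.files.store_metadata_file` is outside the loaded diff. Below is a minimal sketch under the assumption that it writes the JSON payload straight to the bucket, reusing the hypothetical `client` and `BUCKET` from the earlier sketch; the object-key layout is also an assumption.

```python
# Hypothetical sketch of store_metadata_file in app/repositories/files.py.
import io


def store_metadata_file(crawl_id: str, object_name: str, content_type: str, data: str) -> None:
    """Upload a metadata JSON document for a crawl directly to the bucket."""
    payload = data.encode("utf-8")
    client.put_object(
        BUCKET,
        f"{crawl_id}/metadata/{object_name}",  # assumed key layout
        io.BytesIO(payload),
        length=len(payload),
        content_type=content_type,
    )
```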