feat(api): add identifiers field
folland87 committed Feb 19, 2024
1 parent 1286345 commit 0381cb7
Showing 68 changed files with 1,042 additions and 1,415 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -10,7 +10,7 @@ MONGODB_PATH=/mounted/mongodb/ # TODO: Change value before build
GOOGLE_API_KEY=AIzaSyAfdZFZM1mz7IYUgCpESSJX4zdJZ589eX0

# Scrapy
-SCRAPY_SETTINGS_MODULE=app.crawler.settings
+SCRAPY_SETTINGS_MODULE=app.tasks.html_crawl.settings
C_FORCE_ROOT=True

CELERY_BROKER_URL=redis://redis:6379
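For context, a minimal sketch (not code from this commit) of how the two variables above are typically consumed: Scrapy resolves `SCRAPY_SETTINGS_MODULE` when project settings are loaded, and Celery reads the broker URL when the app object is created. The `CELERY_RESULT_BACKEND` fallback shown here is an assumption, not a value visible in this diff.

```python
import os

from celery import Celery
from scrapy.utils.project import get_project_settings

# With SCRAPY_SETTINGS_MODULE=app.tasks.html_crawl.settings exported,
# get_project_settings() loads that module instead of app.crawler.settings.
scrapy_settings = get_project_settings()

# The broker comes from CELERY_BROKER_URL above; the result backend variable
# is assumed for illustration.
celery_app = Celery(
    "open-crawler",
    broker=os.environ.get("CELERY_BROKER_URL", "redis://redis:6379"),
    backend=os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379"),
)
```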
7 changes: 0 additions & 7 deletions .github/workflows/staging.yml
@@ -106,13 +106,6 @@ jobs:
with:
namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
restart: oc-client
-- name: Restart oc-responsiveness-worker deployment
-uses: dataesr/[email protected]
-env:
-KUBE_CONFIG: ${{ secrets.KUBE_CONFIG_DOAD_STAGING }}
-with:
-namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
-restart: oc-responsiveness-worker
- name: Restart oc-technologies-worker deployment
uses: dataesr/[email protected]
env:
7 changes: 0 additions & 7 deletions README.md
@@ -8,12 +8,6 @@ Make sure you have installed :
* git
* docker-compose

-A Google Cloud API KEY is requested in order for the responsiveness metadata to work properly:
-https://developers.google.com/webmaster-tools/search-console-api/v1/configure?hl=fr
-
-Once you created the API KEY, you need to store its value in `GOOGLE_API_KEY` in the .env file.
-
-You also need to activate Google Search Console API: https://console.cloud.google.com/apis/dashboard
## Installation

Git clone project
@@ -92,7 +86,6 @@ This endpoint allows you to create a new website configuration end execute a cra
| `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {}) |
| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) |
-| `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) |
| `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) |
| `tags` | `list[str]` | List of tags to associate to this website (**Default**: []) |
| `crawl_every` | `integer` | Time to wait between each crawl (In days, >= 0, **Default**: 30) |
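To make the configuration table above concrete, here is a hedged example of a request body for the create-website endpoint this README documents. The base URL, the `depth` and `limit` fields, and the example values are assumptions; they are not visible in the rows shown in this hunk.

```python
import requests

# Illustrative payload only: field names follow the table above.
payload = {
    "url": "https://www.example.com",
    "depth": 2,          # assumed field, not shown in this excerpt
    "limit": 400,        # assumed field, not shown in this excerpt
    "headers": {},
    "lighthouse": {"enabled": True, "depth": 0},
    "technologies": {"enabled": False},
    "carbon_footprint": {"enabled": False},
    "tags": ["demo"],
    "crawl_every": 30,
}

response = requests.post("http://localhost:8000/api/websites", json=payload, timeout=30)
response.raise_for_status()
print(response.json())
```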
4 changes: 4 additions & 0 deletions app/api/__init__.py
@@ -0,0 +1,4 @@
+from app.api.crawls import crawls_router
+from app.api.websites import websites_router
+
+__all__ = ["crawls_router", "websites_router"]
5 changes: 3 additions & 2 deletions app/api/crawls_router.py → app/api/crawls.py
@@ -4,7 +4,7 @@
from app.repositories.crawls import crawls
from app.repositories.files import files
from app.repositories.websites import websites
-from app.api.utils import create_crawl, start_crawl
+from app.api.utils import start_crawl
from app.models.crawl import CrawlModel, ListCrawlResponse

crawls_router = APIRouter(
@@ -22,7 +22,8 @@
)
def crawl_website(website_id: str):
if website := websites.get(website_id):
-crawl = create_crawl(website)
+crawl = website.to_crawl()
+crawls.create(crawl)
start_crawl(crawl)
websites.refresh_next_crawl(crawl.website_id)
return crawl
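The endpoint now builds the crawl from the website itself via `website.to_crawl()` and persists it explicitly with `crawls.create(crawl)`. The model method is not shown in this excerpt; the sketch below is only a guess at its shape, based on the `create_crawl()` helper this commit removes from `app/api/utils.py`.

```python
from app.models.crawl import CrawlModel


class WebsiteModel:
    # Illustrative excerpt only; the real model defines many more fields.
    def to_crawl(self) -> CrawlModel:
        # Assumed to mirror the removed create_crawl() helper, minus persistence,
        # which the routers now perform with crawls.create(crawl).
        crawl = CrawlModel(
            website_id=self.id,
            config=self.to_config(),
        )
        crawl.init_tasks()
        return crawl
```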
33 changes: 0 additions & 33 deletions app/api/factory.py

This file was deleted.

8 changes: 0 additions & 8 deletions app/api/main.py

This file was deleted.

58 changes: 22 additions & 36 deletions app/api/utils.py
@@ -1,53 +1,39 @@
from urllib.parse import urlparse

from celery import group, chain, chord

from app.repositories.crawls import crawls
from app.celery_broker.tasks import (
METADATA_TASK_REGISTRY,
start_crawl_process, finalize_crawl_process,
from app.models.enums import MetadataType

from app.tasks import (
get_lighthouse,
get_technologies,
get_carbon_footprint,
get_html_crawl,
finalize_crawl_process,
)
from app.models.crawl import CrawlModel
from app.models.website import WebsiteModel
from app.services.crawler_logger import logger


def create_crawl(website: WebsiteModel) -> CrawlModel:

# Check if the path component of the URL is empty or "/"
# If the crawl target is a single page, we will ignore the depth and the limit in the request.
if not is_domain(website.url):
website.depth = 0
website.limit = 1
logger.warning("The url to crawl is not a domain. Only one page will be crawled")
from app.models.crawl import CrawlModel
from app.services.logging import logger

crawl: CrawlModel = CrawlModel(
website_id=website.id,
config=website.to_config(),
)
crawl.init_tasks()
crawls.create(crawl)
return crawl
METADATA_TASK_REGISTRY = {
MetadataType.LIGHTHOUSE: get_lighthouse,
MetadataType.TECHNOLOGIES: get_technologies,
MetadataType.CARBON_FOOTPRINT: get_carbon_footprint,
}


def start_crawl(crawl: CrawlModel) -> None:
logger.info(f"New crawl process ({crawl.id}) for website {crawl.url}")
logger.info(
f"New crawl process ({crawl.id}) for website {crawl.config.url}"
f"New crawl process ({crawl.id}) for website {crawl.url}"
)
metadata_tasks = group(
METADATA_TASK_REGISTRY.get(metadata).s()
METADATA_TASK_REGISTRY.get(metadata).si(crawl.id)
for metadata in crawl.enabled_metadata
)
# If a task in a chain fails, the remaining tasks in the chain will not be executed.
# To ensure that `finalize_crawl` is executed regardless of whether the previous tasks in the chain fail or succeed,
# We need to put it in the `link_error` callback in start_crawl_process and do a chord with the metadata tasks.
chain(
start_crawl_process.s(crawl).on_error(finalize_crawl_process.s(crawl)),
chord(metadata_tasks, finalize_crawl_process.s(crawl)),
get_html_crawl.si(crawl.id).on_error(
finalize_crawl_process.si(crawl.id)),
chord(metadata_tasks, finalize_crawl_process.si(crawl.id)),
finalize_crawl_process.si(crawl.id)
).apply_async(task_id=crawl.id)


def is_domain(url: str) -> bool:
parsed_url = urlparse(url)
return parsed_url.path == '' or parsed_url.path == '/'
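The reworked `start_crawl` chains the HTML crawl, a chord of the enabled metadata tasks, and a final cleanup step, all with immutable `.si(crawl.id)` signatures so parent results are never injected as arguments. Below is a self-contained toy version of that shape using plain Celery; the task bodies, the in-memory broker and backend, and the single enabled metadata task are assumptions, not project code.

```python
from celery import Celery, chain, chord, group

app = Celery("toy", broker="memory://", backend="cache+memory://")


@app.task
def get_html_crawl(crawl_id):
    print(f"html crawl for {crawl_id}")


@app.task
def get_lighthouse(crawl_id):
    print(f"lighthouse for {crawl_id}")


@app.task
def finalize_crawl_process(crawl_id):
    print(f"finalize {crawl_id}")


crawl_id = "demo-crawl-id"
# Only one metadata task is "enabled" in this toy; the real code builds the
# group from METADATA_TASK_REGISTRY and crawl.enabled_metadata.
metadata_tasks = group(get_lighthouse.si(crawl_id))

workflow = chain(
    # on_error registers finalize as a link_error callback, so cleanup still
    # runs if the crawl step fails and the rest of the chain is skipped.
    get_html_crawl.si(crawl_id).on_error(finalize_crawl_process.si(crawl_id)),
    chord(metadata_tasks, finalize_crawl_process.si(crawl_id)),
    finalize_crawl_process.si(crawl_id),
)
workflow.apply_async(task_id=crawl_id)
```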

56 changes: 46 additions & 10 deletions app/api/websites_router.py → app/api/websites.py
@@ -2,12 +2,14 @@
from pymongo.errors import DuplicateKeyError
from fastapi.responses import StreamingResponse


from app.repositories.crawls import crawls
from app.repositories.websites import websites
from app.repositories.files import files
-from app.api.utils import create_crawl, start_crawl
-from app.models.request import UpdateWebsiteRequest, CreateWebsiteRequest
-from app.models.website import WebsiteModel, ListWebsiteResponse
+from app.api.utils import start_crawl
+from app.models.website import (
+WebsiteModel, ListWebsiteResponse,
+UpdateWebsiteRequest, CreateWebsiteRequest
+)

websites_router = APIRouter(
prefix="/api/websites",
@@ -33,7 +35,8 @@ def create_website(data: CreateWebsiteRequest):
detail="Website already exists.",
) from e

-crawl = create_crawl(website)
+crawl = website.to_crawl()
+crawls.create(crawl)
start_crawl(crawl)
return website

@@ -49,11 +52,18 @@ def list_websites(
skip: int = 0,
limit: int = 10,
tags: str | None = None,
+identifiers: str | None = None,
status: str | None = None,
sort: str = "created_at",
):
return websites.list(
-query=query, tags=tags, status=status, skip=skip, limit=limit, sort=sort
+query=query,
+tags=tags,
+identifiers=identifiers,
+status=status,
+skip=skip,
+limit=limit,
+sort=sort
)
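Usage sketch for the new `identifiers` filter exposed above. The base URL and the identifier value are placeholders, and the `data` key in the response is inferred from how `ListWebsiteResponse` is used elsewhere in this diff.

```python
import requests

response = requests.get(
    "http://localhost:8000/api/websites",
    params={"identifiers": "example-identifier", "skip": 0, "limit": 10, "sort": "created_at"},
    timeout=30,
)
response.raise_for_status()
for website in response.json().get("data", []):
    print(website.get("url"), website.get("tags"))
```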


@@ -75,7 +85,6 @@ def get_website(website_id: str):

@websites_router.get(
"/{website_id}/files",
-response_model=WebsiteModel,
status_code=statuscode.HTTP_200_OK,
summary="Get a website's zip of last crawl files by its unique ID",
)
@@ -87,7 +96,34 @@ def get_website_files(website_id: str) -> StreamingResponse:
iter([zip_io.getvalue()]),
media_type="application/x-zip-compressed",
headers={
"Content-Disposition": f"attachment; filename={website_id}.zip"
"Content-Disposition": f"attachment; filename={last_crawl_id}.zip"
},
)
raise HTTPException(
+status_code=statuscode.HTTP_404_NOT_FOUND,
+detail="Website not found",
+)
+
+
+@websites_router.get(
+"/files/{identifiers_id}",
+status_code=statuscode.HTTP_200_OK,
+summary="Get a website's zip of last crawl files by identifiers",
+)
+def get_website_files(identifiers_id: str) -> StreamingResponse:
+if data := websites.list(
+query=None,
+tags=None,
+status=None,
+identifiers=identifiers_id
+).data[0]:
+if last_crawl_id := data.last_crawl.get("id"):
+zip_io = files.zip_all_crawl_files(last_crawl_id)
+return StreamingResponse(
+iter([zip_io.getvalue()]),
+media_type="application/x-zip-compressed",
+headers={
+"Content-Disposition": f"attachment; filename={last_crawl_id}.zip"
+},
+)
raise HTTPException(
@@ -130,7 +166,7 @@ def delete_website(website_id: str):
)
def recrawl_cron():
for website in websites.list_to_recrawl().data:
crawl = create_crawl(website)
crawl = website.to_crawl()
crawls.create(crawl)
start_crawl(crawl)
websites.refresh_next_crawl(crawl.website_id)
return crawl
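A hedged client-side counterpart for the new files-by-identifiers route added above: stream the zip of the last crawl to disk. The host, port, and identifier value are assumptions.

```python
import requests

identifier = "example-identifier"
url = f"http://localhost:8000/api/websites/files/{identifier}"

with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(f"{identifier}.zip", "wb") as archive:
        for chunk in response.iter_content(chunk_size=8192):
            archive.write(chunk)
```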
51 changes: 0 additions & 51 deletions app/celery_broker/crawler_utils.py

This file was deleted.

27 changes: 0 additions & 27 deletions app/celery_broker/factory.py

This file was deleted.

8 changes: 0 additions & 8 deletions app/celery_broker/main.py

This file was deleted.

