feat(api): add identifiers field
folland87 committed Feb 19, 2024
1 parent 1286345 commit 0381cb7
Showing 68 changed files with 1,042 additions and 1,415 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -10,7 +10,7 @@ MONGODB_PATH=/mounted/mongodb/ # TODO: Change value before build
GOOGLE_API_KEY=AIzaSyAfdZFZM1mz7IYUgCpESSJX4zdJZ589eX0

# Scrapy
-SCRAPY_SETTINGS_MODULE=app.crawler.settings
+SCRAPY_SETTINGS_MODULE=app.tasks.html_crawl.settings
C_FORCE_ROOT=True

CELERY_BROKER_URL=redis://redis:6379
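For context, a minimal sketch (not code from this commit) of how the two variables above are typically consumed: Scrapy resolves `SCRAPY_SETTINGS_MODULE` when project settings are loaded, and Celery reads the broker URL when the app object is created. The `CELERY_RESULT_BACKEND` fallback shown here is an assumption, not a value visible in this diff.

```python
import os

from celery import Celery
from scrapy.utils.project import get_project_settings

# With SCRAPY_SETTINGS_MODULE=app.tasks.html_crawl.settings exported,
# get_project_settings() loads that module instead of app.crawler.settings.
scrapy_settings = get_project_settings()

# The broker comes from CELERY_BROKER_URL above; the result backend variable
# is assumed for illustration.
celery_app = Celery(
    "open-crawler",
    broker=os.environ.get("CELERY_BROKER_URL", "redis://redis:6379"),
    backend=os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379"),
)
```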
7 changes: 0 additions & 7 deletions .github/workflows/staging.yml
@@ -106,13 +106,6 @@ jobs:
with:
namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
restart: oc-client
-- name: Restart oc-responsiveness-worker deployment
-uses: dataesr/[email protected]
-env:
-KUBE_CONFIG: ${{ secrets.KUBE_CONFIG_DOAD_STAGING }}
-with:
-namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
-restart: oc-responsiveness-worker
- name: Restart oc-technologies-worker deployment
uses: dataesr/[email protected]
env:
7 changes: 0 additions & 7 deletions README.md
@@ -8,12 +8,6 @@ Make sure you have installed :
* git
* docker-compose

-A Google Cloud API KEY is requested in order for the responsiveness metadata to work properly:
-https://developers.google.com/webmaster-tools/search-console-api/v1/configure?hl=fr
-
-Once you created the API KEY, you need to store its value in `GOOGLE_API_KEY` in the .env file.
-
-You also need to activate Google Search Console API: https://console.cloud.google.com/apis/dashboard
## Installation

Git clone project
@@ -92,7 +86,6 @@ This endpoint allows you to create a new website configuration end execute a cra
| `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {}) |
| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) |
-| `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) |
| `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) |
| `tags` | `list[str]` | List of tags to associate to this website (**Default**: []) |
| `crawl_every` | `integer` | Time to wait between each crawl (In days, >= 0, **Default**: 30) |
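To make the configuration table above concrete, here is a hedged example of a request body for the create-website endpoint this README documents. The base URL, the `depth` and `limit` fields, and the example values are assumptions; they are not visible in the rows shown in this hunk.

```python
import requests

# Illustrative payload only: field names follow the table above.
payload = {
    "url": "https://www.example.com",
    "depth": 2,          # assumed field, not shown in this excerpt
    "limit": 400,        # assumed field, not shown in this excerpt
    "headers": {},
    "lighthouse": {"enabled": True, "depth": 0},
    "technologies": {"enabled": False},
    "carbon_footprint": {"enabled": False},
    "tags": ["demo"],
    "crawl_every": 30,
}

response = requests.post("http://localhost:8000/api/websites", json=payload, timeout=30)
response.raise_for_status()
print(response.json())
```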
4 changes: 4 additions & 0 deletions app/api/__init__.py
@@ -0,0 +1,4 @@
+from app.api.crawls import crawls_router
+from app.api.websites import websites_router
+
+__all__ = ["crawls_router", "websites_router"]
5 changes: 3 additions & 2 deletions app/api/crawls_router.py → app/api/crawls.py
@@ -4,7 +4,7 @@
from app.repositories.crawls import crawls
from app.repositories.files import files
from app.repositories.websites import websites
-from app.api.utils import create_crawl, start_crawl
+from app.api.utils import start_crawl
from app.models.crawl import CrawlModel, ListCrawlResponse

crawls_router = APIRouter(
@@ -22,7 +22,8 @@
)
def crawl_website(website_id: str):
if website := websites.get(website_id):
-crawl = create_crawl(website)
+crawl = website.to_crawl()
+crawls.create(crawl)
start_crawl(crawl)
websites.refresh_next_crawl(crawl.website_id)
return crawl
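The endpoint now builds the crawl from the website itself via `website.to_crawl()` and persists it explicitly with `crawls.create(crawl)`. The model method is not shown in this excerpt; the sketch below is only a guess at its shape, based on the `create_crawl()` helper this commit removes from `app/api/utils.py`.

```python
from app.models.crawl import CrawlModel


class WebsiteModel:
    # Illustrative excerpt only; the real model defines many more fields.
    def to_crawl(self) -> CrawlModel:
        # Assumed to mirror the removed create_crawl() helper, minus persistence,
        # which the routers now perform with crawls.create(crawl).
        crawl = CrawlModel(
            website_id=self.id,
            config=self.to_config(),
        )
        crawl.init_tasks()
        return crawl
```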
33 changes: 0 additions & 33 deletions app/api/factory.py

This file was deleted.

8 changes: 0 additions & 8 deletions app/api/main.py

This file was deleted.

58 changes: 22 additions & 36 deletions app/api/utils.py
@@ -1,53 +1,39 @@
from urllib.parse import urlparse

from celery import group, chain, chord

from app.repositories.crawls import crawls
from app.celery_broker.tasks import (
METADATA_TASK_REGISTRY,
start_crawl_process, finalize_crawl_process,
from app.models.enums import MetadataType

from app.tasks import (
get_lighthouse,
get_technologies,
get_carbon_footprint,
get_html_crawl,
finalize_crawl_process,
)
from app.models.crawl import CrawlModel
from app.models.website import WebsiteModel
from app.services.crawler_logger import logger


def create_crawl(website: WebsiteModel) -> CrawlModel:

# Check if the path component of the URL is empty or "/"
# If the crawl target is a single page, we will ignore the depth and the limit in the request.
if not is_domain(website.url):
website.depth = 0
website.limit = 1
logger.warning("The url to crawl is not a domain. Only one page will be crawled")
from app.models.crawl import CrawlModel
from app.services.logging import logger

crawl: CrawlModel = CrawlModel(
website_id=website.id,
config=website.to_config(),
)
crawl.init_tasks()
crawls.create(crawl)
return crawl
METADATA_TASK_REGISTRY = {
MetadataType.LIGHTHOUSE: get_lighthouse,
MetadataType.TECHNOLOGIES: get_technologies,
MetadataType.CARBON_FOOTPRINT: get_carbon_footprint,
}


def start_crawl(crawl: CrawlModel) -> None:
logger.info(f"New crawl process ({crawl.id}) for website {crawl.url}")
logger.info(
f"New crawl process ({crawl.id}) for website {crawl.config.url}"
f"New crawl process ({crawl.id}) for website {crawl.url}"
)
metadata_tasks = group(
METADATA_TASK_REGISTRY.get(metadata).s()
METADATA_TASK_REGISTRY.get(metadata).si(crawl.id)
for metadata in crawl.enabled_metadata
)
# If a task in a chain fails, the remaining tasks in the chain will not be executed.
# To ensure that `finalize_crawl` is executed regardless of whether the previous tasks in the chain fail or succeed,
# We need to put it in the `link_error` callback in start_crawl_process and do a chord with the metadata tasks.
chain(
start_crawl_process.s(crawl).on_error(finalize_crawl_process.s(crawl)),
chord(metadata_tasks, finalize_crawl_process.s(crawl)),
get_html_crawl.si(crawl.id).on_error(
finalize_crawl_process.si(crawl.id)),
chord(metadata_tasks, finalize_crawl_process.si(crawl.id)),
finalize_crawl_process.si(crawl.id)
).apply_async(task_id=crawl.id)


def is_domain(url: str) -> bool:
parsed_url = urlparse(url)
return parsed_url.path == '' or parsed_url.path == '/'
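The reworked `start_crawl` chains the HTML crawl, a chord of the enabled metadata tasks, and a final cleanup step, all with immutable `.si(crawl.id)` signatures so parent results are never injected as arguments. Below is a self-contained toy version of that shape using plain Celery; the task bodies, the in-memory broker and backend, and the single enabled metadata task are assumptions, not project code.

```python
from celery import Celery, chain, chord, group

app = Celery("toy", broker="memory://", backend="cache+memory://")


@app.task
def get_html_crawl(crawl_id):
    print(f"html crawl for {crawl_id}")


@app.task
def get_lighthouse(crawl_id):
    print(f"lighthouse for {crawl_id}")


@app.task
def finalize_crawl_process(crawl_id):
    print(f"finalize {crawl_id}")


crawl_id = "demo-crawl-id"
# Only one metadata task is "enabled" in this toy; the real code builds the
# group from METADATA_TASK_REGISTRY and crawl.enabled_metadata.
metadata_tasks = group(get_lighthouse.si(crawl_id))

workflow = chain(
    # on_error registers finalize as a link_error callback, so cleanup still
    # runs if the crawl step fails and the rest of the chain is skipped.
    get_html_crawl.si(crawl_id).on_error(finalize_crawl_process.si(crawl_id)),
    chord(metadata_tasks, finalize_crawl_process.si(crawl_id)),
    finalize_crawl_process.si(crawl_id),
)
workflow.apply_async(task_id=crawl_id)
```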

56 changes: 46 additions & 10 deletions app/api/websites_router.py → app/api/websites.py
@@ -2,12 +2,14 @@
from pymongo.errors import DuplicateKeyError
from fastapi.responses import StreamingResponse


from app.repositories.crawls import crawls
from app.repositories.websites import websites
from app.repositories.files import files
-from app.api.utils import create_crawl, start_crawl
-from app.models.request import UpdateWebsiteRequest, CreateWebsiteRequest
-from app.models.website import WebsiteModel, ListWebsiteResponse
+from app.api.utils import start_crawl
+from app.models.website import (
+WebsiteModel, ListWebsiteResponse,
+UpdateWebsiteRequest, CreateWebsiteRequest
+)

websites_router = APIRouter(
prefix="/api/websites",
@@ -33,7 +35,8 @@ def create_website(data: CreateWebsiteRequest):
detail="Website already exists.",
) from e

-crawl = create_crawl(website)
+crawl = website.to_crawl()
+crawls.create(crawl)
start_crawl(crawl)
return website

@@ -49,11 +52,18 @@ def list_websites(
skip: int = 0,
limit: int = 10,
tags: str | None = None,
+identifiers: str | None = None,
status: str | None = None,
sort: str = "created_at",
):
return websites.list(
-query=query, tags=tags, status=status, skip=skip, limit=limit, sort=sort
+query=query,
+tags=tags,
+identifiers=identifiers,
+status=status,
+skip=skip,
+limit=limit,
+sort=sort
)
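Usage sketch for the new `identifiers` filter exposed above. The base URL and the identifier value are placeholders, and the `data` key in the response is inferred from how `ListWebsiteResponse` is used elsewhere in this diff.

```python
import requests

response = requests.get(
    "http://localhost:8000/api/websites",
    params={"identifiers": "example-identifier", "skip": 0, "limit": 10, "sort": "created_at"},
    timeout=30,
)
response.raise_for_status()
for website in response.json().get("data", []):
    print(website.get("url"), website.get("tags"))
```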


@@ -75,7 +85,6 @@ def get_website(website_id: str):

@websites_router.get(
"/{website_id}/files",
-response_model=WebsiteModel,
status_code=statuscode.HTTP_200_OK,
summary="Get a website's zip of last crawl files by its unique ID",
)
@@ -87,7 +96,34 @@ def get_website_files(website_id: str) -> StreamingResponse:
iter([zip_io.getvalue()]),
media_type="application/x-zip-compressed",
headers={
"Content-Disposition": f"attachment; filename={website_id}.zip"
"Content-Disposition": f"attachment; filename={last_crawl_id}.zip"
},
)
raise HTTPException(
+status_code=statuscode.HTTP_404_NOT_FOUND,
+detail="Website not found",
+)
+
+
+@websites_router.get(
+"/files/{identifiers_id}",
+status_code=statuscode.HTTP_200_OK,
+summary="Get a website's zip of last crawl files by identifiers",
+)
+def get_website_files(identifiers_id: str) -> StreamingResponse:
+if data := websites.list(
+query=None,
+tags=None,
+status=None,
+identifiers=identifiers_id
+).data[0]:
+if last_crawl_id := data.last_crawl.get("id"):
+zip_io = files.zip_all_crawl_files(last_crawl_id)
+return StreamingResponse(
+iter([zip_io.getvalue()]),
+media_type="application/x-zip-compressed",
+headers={
+"Content-Disposition": f"attachment; filename={last_crawl_id}.zip"
+},
+)
raise HTTPException(
@@ -130,7 +166,7 @@ def delete_website(website_id: str):
)
def recrawl_cron():
for website in websites.list_to_recrawl().data:
crawl = create_crawl(website)
crawl = website.to_crawl()
crawls.create(crawl)
start_crawl(crawl)
websites.refresh_next_crawl(crawl.website_id)
return crawl
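A hedged client-side counterpart for the new files-by-identifiers route added above: stream the zip of the last crawl to disk. The host, port, and identifier value are assumptions.

```python
import requests

identifier = "example-identifier"
url = f"http://localhost:8000/api/websites/files/{identifier}"

with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(f"{identifier}.zip", "wb") as archive:
        for chunk in response.iter_content(chunk_size=8192):
            archive.write(chunk)
```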
51 changes: 0 additions & 51 deletions app/celery_broker/crawler_utils.py

This file was deleted.

27 changes: 0 additions & 27 deletions app/celery_broker/factory.py

This file was deleted.

8 changes: 0 additions & 8 deletions app/celery_broker/main.py

This file was deleted.

