Merge pull request #28 from dataesr/dev
Dev
folland87 authored Nov 16, 2023
2 parents e133ab4 + 308e63a commit 1e0f2fd
Showing 54 changed files with 579 additions and 536 deletions.
14 changes: 7 additions & 7 deletions .env
@@ -1,14 +1,15 @@
MODE=dev
#Volumes Config
CONFIG_PROFILE=development

# Volumes Config
LOCAL_FILES_PATH=/mounted/local_files/ # TODO: Change value before build
MINIO_PATH=/mounted/minio/ # TODO: Change value before build
MONGODB_PATH=/mounted/mongodb/ # TODO: Change value before build

#Storage Service Config

#API_KEYS
GOOGLE_API_KEY=CHANGEME
# API_KEYS
GOOGLE_API_KEY=AIzaSyAfdZFZM1mz7IYUgCpESSJX4zdJZ589eX0

# Scrapy
SCRAPY_SETTINGS_MODULE=app.crawler.settings
C_FORCE_ROOT=True

@@ -24,10 +25,9 @@ STORAGE_SERVICE_USERNAME=admin
STORAGE_SERVICE_PASSWORD=password123
STORAGE_SERVICE_URL=minio:9000
STORAGE_SERVICE_REGION=gra
STORAGE_SERVICE_SECURE=false
STORAGE_SERVICE_BUCKET_NAME=open-crawler
HTML_FOLDER_NAME=html # TODO: Change value before build
METADATA_FOLDER_NAME=metadata # TODO: Change value before build

LOGGER_LEVEL=INFO

DEFAULT_RECRAWL_INTERVAL=30
3 changes: 3 additions & 0 deletions .github/workflows/staging.yml
@@ -20,6 +20,9 @@ jobs:
    steps:
      - uses: actions/checkout@v3

      - name : Install Packages
        run : pip install -r requirements.txt

      - name: test
        run: python -m unittest

22 changes: 22 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,22 @@
name: Testing deployment

on:
  push:

jobs:
  unit-test:
    name: run unit tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.11
        uses: actions/setup-python@v1
        with:
          python-version: 3.11

      - name : Install Packages
        run : pip install -r requirements.txt

      - name: test
        run: python -m unittest
7 changes: 4 additions & 3 deletions README.md
@@ -89,8 +89,7 @@ This endpoint allows you to create a new website configuration and execute a crawl
| `depth` | `integer` | Maximum depth to crawl (**Default**: 2) |
| `limit` | `integer` | Maximum pages to crawl (**Default**: 400) |
| `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {})|
| `accessibility` | `MetadataConfig` | Accessibility configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `good_practices` | `MetadataConfig` | Good Practices configuration (**Default**: {'enabled': False}) |
| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth' 0}) |
| `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) |
| `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) |
| `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) |
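For context, the parameters in the table above could be posted to the crawl-creation endpoint roughly as follows. This is a hedged sketch: the host, port, and path (`/websites`) are assumptions, not taken from this diff; only the field names and defaults come from the table.

```python
# Hypothetical request sketch -- the base URL and the /websites path are assumptions.
import requests

payload = {
    "url": "https://www.example.com",             # site to crawl
    "depth": 2,                                   # default: 2
    "limit": 400,                                 # default: 400
    "headers": {},                                # extra headers for every crawl request
    "lighthouse": {"enabled": True, "depth": 0},
    "technologies": {"enabled": False},
    "responsiveness": {"enabled": False},
    "carbon_footprint": {"enabled": False},
}

response = requests.post("http://localhost:8080/websites", json=payload, timeout=30)
response.raise_for_status()
print(response.json())  # the created website configuration, including its id
```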
@@ -132,7 +131,9 @@ To access the two collections, use a MongoDB console (such as MongoDB Compass fo
**website_crawl_parameters** collection:
![mongodb_config](./demo/mongodb_crawl_configuration.png)

## Access simple storage service

At the end of the crawl process, all crawled html pages and metadata files are uploaded to a simple storage service (s3).
At the end of the crawl process, all crawled html pages are uploaded to a simple storage service (s3).
The metadata are directly uploaded to the storage service.

The docker-compose file deploys a MinIO service that can be accessed at http://localhost:9090 by default.

![minio](./demo/minio.png)
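For readers who prefer a programmatic check over the console, here is a minimal sketch that lists the bucket with the MinIO Python client, using the development defaults from the .env excerpt above; the `localhost:9000` port mapping is an assumption about the compose file.

```python
# Minimal sketch: list the objects stored by the crawler, using the development
# defaults shown in the .env diff above. Do not reuse these credentials in production.
from minio import Minio

client = Minio(
    "localhost:9000",          # STORAGE_SERVICE_URL is minio:9000 inside the compose network
    access_key="admin",        # STORAGE_SERVICE_USERNAME
    secret_key="password123",  # STORAGE_SERVICE_PASSWORD
    secure=False,              # STORAGE_SERVICE_SECURE=false
    region="gra",              # STORAGE_SERVICE_REGION
)

for obj in client.list_objects("open-crawler", recursive=True):
    print(obj.object_name, obj.size)
```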
42 changes: 13 additions & 29 deletions app/api/crawls_router.py
@@ -1,10 +1,5 @@
import io
import os
from zipfile import ZipFile, ZIP_DEFLATED

from fastapi import HTTPException, APIRouter, status as statuscode
from fastapi.responses import StreamingResponse
from minio import Minio

import app.repositories as repositories
from app.api.utils import create_crawl, start_crawl
@@ -58,34 +53,23 @@ def list_crawls(
    status_code=statuscode.HTTP_200_OK,
    summary="Get a zip of all files from a crawl",
)
def get_crawl_files(website_id: str, crawl_id: str) -> StreamingResponse:
def get_crawl_files(crawl_id: str) -> StreamingResponse:
    """Zip the files from the storage service"""
    client = Minio(
        endpoint=os.environ["STORAGE_SERVICE_URL"],
        access_key=os.environ["STORAGE_SERVICE_USERNAME"],
        secret_key=os.environ["STORAGE_SERVICE_PASSWORD"],
        secure=os.environ.get("STORAGE_SERVICE_SECURE", False),
        region=os.environ.get("STORAGE_SERVICE_REGION", None),
    )

    bucket = os.environ["STORAGE_SERVICE_BUCKET_NAME"]
    zip_io = io.BytesIO()
    if not (crawl := repositories.crawls.get(website_id, crawl_id)):
        raise HTTPException(
            status_code=statuscode.HTTP_404_NOT_FOUND,
            detail="Crawl not found",
        )
    url = crawl.config.url.replace("https://", "").replace("http://", "")
    prefix = f"{url}/{crawl_id}"
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper:
        for obj in objects:
            file = client.get_object(bucket, obj.object_name).read()
            zipper.writestr(obj.object_name, file)
    zip_io = repositories.files.zip_all_crawl_files(crawl_id)
    return StreamingResponse(
        iter([zip_io.getvalue()]),
        media_type="application/x-zip-compressed",
        headers={
            "Content-Disposition": f"attachment; filename={url}-{crawl_id}.zip"
            "Content-Disposition": f"attachment; filename={crawl_id}.zip"
        },
    )


@crawls_router.delete(
    "/{website_id}/crawls/{crawl_id}",
    status_code=statuscode.HTTP_204_NO_CONTENT,
    summary="Delete a crawl",
)
def delete_crawl(crawl_id: str) -> None:
    """Delete all crawl files from the storage service"""
    return repositories.files.delete_all_crawl_files(crawl_id)
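The new `app.repositories.files` module is not part of the loaded diff. The sketch below shows what `zip_all_crawl_files` plausibly does, reusing the MinIO calls removed above; the module layout and the crawl-id object prefix are assumptions.

```python
# Hypothetical sketch of app/repositories/files.py -- not the committed implementation.
import io
import os
from zipfile import ZipFile, ZIP_DEFLATED

from minio import Minio

client = Minio(
    endpoint=os.environ["STORAGE_SERVICE_URL"],
    access_key=os.environ["STORAGE_SERVICE_USERNAME"],
    secret_key=os.environ["STORAGE_SERVICE_PASSWORD"],
    secure=os.environ.get("STORAGE_SERVICE_SECURE", "false").lower() == "true",
    region=os.environ.get("STORAGE_SERVICE_REGION"),
)
BUCKET = os.environ["STORAGE_SERVICE_BUCKET_NAME"]


def zip_all_crawl_files(crawl_id: str) -> io.BytesIO:
    """Bundle every stored object belonging to a crawl into an in-memory zip."""
    zip_io = io.BytesIO()
    with ZipFile(zip_io, "a", ZIP_DEFLATED, False) as zipper:
        # Assumes objects are now keyed by crawl_id instead of the site URL.
        for obj in client.list_objects(BUCKET, prefix=crawl_id, recursive=True):
            data = client.get_object(BUCKET, obj.object_name).read()
            zipper.writestr(obj.object_name, data)
    return zip_io
```

Moving this logic behind a repository keeps the router free of storage details and lets `delete_all_crawl_files` share the same client.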
4 changes: 3 additions & 1 deletion app/api/factory.py
@@ -5,6 +5,7 @@

from app.api.crawls_router import crawls_router
from app.api.websites_router import websites_router
from app.config import settings


def create_api_app() -> FastAPI:
@@ -18,7 +19,7 @@ def create_api_app() -> FastAPI:
    )

    # Configure CORS for non-production modes
    deployment_mode = os.environ.get("MODE", "production")
    deployment_mode = settings.MODE
    if deployment_mode != "production":
        api_app.add_middleware(
            CORSMiddleware,
@@ -27,6 +28,7 @@
            allow_methods=["*"],
            allow_headers=["*"],
        )
    # TODO: Configure CORS for production mode

    api_app.include_router(websites_router)
    api_app.include_router(crawls_router)
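`app.config.settings` replaces direct `os.environ` lookups here and in the Celery factory further down. It is not shown in the loaded diff; here is a minimal sketch, assuming pydantic-settings and only the fields used in this commit (default values are placeholders).

```python
# Hypothetical sketch of app/config.py -- assumes pydantic-settings is used.
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    MODE: str = "production"
    CELERY_BROKER_URL: str = "redis://redis:6379/0"      # placeholder default
    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"  # placeholder default


settings = Settings()
```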
19 changes: 17 additions & 2 deletions app/api/utils.py
@@ -1,17 +1,27 @@
from urllib.parse import urlparse

from celery import group, chain

import app.repositories as repositories
from app.celery_broker.tasks import (
METADATA_TASK_REGISTRY,
start_crawl_process,
upload_html,
)
from app.models.crawl import CrawlModel
from app.models.enums import ProcessStatus
from app.models.website import WebsiteModel
from app.services.crawler_logger import logger


def create_crawl(website: WebsiteModel) -> CrawlModel:

    # Check if the path component of the URL is empty or "/"
    # If the crawl target is a single page, we will ignore the depth and the limit in the request.
    if not is_domain(website.url):
        website.depth = 0
        website.limit = 1
        logger.warning("The url to crawl is not a domain. Only one page will be crawled")

    crawl: CrawlModel = CrawlModel(
        website_id=website.id,
        config=website.to_config(),
@@ -32,5 +42,10 @@ def start_crawl(crawl: CrawlModel) -> None:
    chain(
        start_crawl_process.s(crawl),
        metadata_tasks,
        upload_html.si(crawl),
    ).apply_async(task_id=crawl.id)


def is_domain(url: str) -> bool:
    parsed_url = urlparse(url)
    return parsed_url.path == '' or parsed_url.path == '/'
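A quick, self-contained illustration of the new single-page rule (the URLs are hypothetical examples):

```python
# is_domain() treats a URL with an empty or "/" path as a whole-domain crawl target.
from urllib.parse import urlparse


def is_domain(url: str) -> bool:
    parsed_url = urlparse(url)
    return parsed_url.path in ("", "/")


print(is_domain("https://www.example.com"))        # True  -> depth and limit are kept
print(is_domain("https://www.example.com/"))       # True
print(is_domain("https://www.example.com/page1"))  # False -> depth forced to 0, limit to 1
```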

11 changes: 11 additions & 0 deletions app/celery_broker/crawler_utils.py
@@ -32,3 +32,14 @@ def start_crawler_process(crawl_process: CrawlProcess, results: dict):
    process.crawl(MenesrSpider, crawl_process=crawl_process)
    process.start()
    results["metadata"] = dict(crawl_process.metadata.items())


def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessStatus):
    crawl.html_crawl.update(
        task_id=request_id, status=status
    )
    repositories.crawls.update_task(
        crawl_id=crawl.id,
        task_name="html_crawl",
        task=crawl.html_crawl,
    )
6 changes: 3 additions & 3 deletions app/celery_broker/factory.py
@@ -2,14 +2,14 @@

from celery import Celery

from app.celery_broker.config import settings
from app.config import settings


def create_celery_app() -> Celery:
    celery_app = Celery(
        "scanr",
        broker=os.environ.get("CELERY_BROKER_URL"),
        backend=os.environ.get("CELERY_RESULT_BACKEND"),
        broker=settings.CELERY_BROKER_URL,
        backend=settings.CELERY_RESULT_BACKEND,
        broker_connection_retry_on_startup=True,
        include=["app.celery_broker.tasks"],
    )
20 changes: 7 additions & 13 deletions app/celery_broker/metadata_utils.py
@@ -6,10 +6,7 @@
from app.models.enums import MetadataType, ProcessStatus
from app.models.metadata import MetadataTask
from app.models.process import CrawlProcess
from app.services.accessibility_best_practices_calculator import (
    AccessibilityError,
    BestPracticesError,
)
from app.services.lighthouse_calculator import LighthouseError
from app.services.carbon_calculator import CarbonCalculatorError
from app.services.crawler_logger import logger
from app.services.responsiveness_calculator import ResponsivenessCalculatorError
@@ -46,14 +43,12 @@ def handle_metadata_result(
def store_metadata_result(
    crawl_process: CrawlProcess, result: dict, metadata_type: MetadataType
):
    base_file_path = (
        f"/{os.environ['LOCAL_FILES_PATH'].strip('/')}/{crawl_process.id}"
    )
    file_path = pathlib.Path(
        f"{base_file_path}/{os.environ['METADATA_FOLDER_NAME'].strip('/')}/{metadata_type}.json"
    return repositories.files.store_metadata_file(
        crawl_id=crawl_process.id,
        object_name=f"{metadata_type}.json",
        content_type='application/json',
        data=json.dumps(result, indent=2, default=str)
    )
    file_path.parent.mkdir(exist_ok=True, parents=True)
    file_path.write_text(json.dumps(result, indent=4))


def metadata_task(
@@ -78,8 +73,7 @@ def metadata_task(
            data = calc_method(url)
            result[url] = data
        except (
            AccessibilityError,
            BestPracticesError,
            LighthouseError,
            TechnologiesError,
            ResponsivenessCalculatorError,
            CarbonCalculatorError,
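Like the zip helper sketched earlier, `repositories.files.store_metadata_file` is outside the loaded diff. Below is a minimal sketch under the assumption that it writes the JSON payload straight to the bucket, reusing the hypothetical `client` and `BUCKET` from the earlier sketch; the object-key layout is also an assumption.

```python
# Hypothetical sketch of store_metadata_file in app/repositories/files.py.
import io


def store_metadata_file(crawl_id: str, object_name: str, content_type: str, data: str) -> None:
    """Upload a metadata JSON document for a crawl directly to the bucket."""
    payload = data.encode("utf-8")
    client.put_object(
        BUCKET,
        f"{crawl_id}/metadata/{object_name}",  # assumed key layout
        io.BytesIO(payload),
        length=len(payload),
        content_type=content_type,
    )
```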