Remove uploads queue
Pandalei97 committed Nov 15, 2023
1 parent db5f558 · commit 00d3ba5
Showing 6 changed files with 15 additions and 21 deletions.
README.md (4 changes: 3 additions & 1 deletion)
@@ -131,7 +131,9 @@ To access the two collections, use a MongoDB console (such as MongoDB Compass fo
**website_crawl_parameters** collection:
![mongodb_config](./demo/mongodb_crawl_configuration.png)

## Access simple storage service

- At the end of the crawl process, all crawled html pages and metadata files are uploaded to a simple storage service (s3).
+ At the end of the crawl process, all crawled HTML pages are uploaded to a simple storage service (S3).
+ Metadata files are uploaded directly to the storage service.

The docker-compose file deploys a MinIO service that can be accessed at http://localhost:9090 by default.

![minio](./demo/minio.png)
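To check what actually lands in the bucket after a crawl, the MinIO deployment can be queried through any S3 client. A minimal sketch using boto3, assuming the S3 API listens on port 9000 (the console above is 9090), the default minioadmin credentials, and a hypothetical bucket name `crawls`:

```python
import boto3

# Sketch only: connect to the local MinIO S3 API. The endpoint port,
# credentials, and bucket name below are assumptions -- adjust them to
# match the actual docker-compose configuration.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)

# List the crawled HTML pages and metadata files uploaded for inspection.
response = s3.list_objects_v2(Bucket="crawls")
for obj in response.get("Contents", []):
    print(obj["Key"], obj["Size"])
```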
app/celery_broker/config.py (12 changes: 1 addition & 11 deletions)
@@ -10,7 +10,6 @@ class BaseConfig:
TECHNOLOGIES_QUEUE_NAME = "technologies_queue"
RESPONSIVENESS_QUEUE_NAME = "responsiveness_queue"
CARBON_QUEUE_NAME = "carbon_footprint_queue"
- UPLOAD_QUEUE_NAME = "upload_queue"

# The following two lines make celery execute tasks locally
# task_always_eager = True
@@ -45,11 +44,6 @@ class BaseConfig:
Exchange(CARBON_QUEUE_NAME),
routing_key=CARBON_QUEUE_NAME,
),
- Queue(
-     UPLOAD_QUEUE_NAME,
-     Exchange(UPLOAD_QUEUE_NAME),
-     routing_key=UPLOAD_QUEUE_NAME,
- ),
)

task_routes = {
@@ -69,11 +63,7 @@ class BaseConfig:
"get_carbon_footprint": {
"queue": CARBON_QUEUE_NAME,
"routing_key": CARBON_QUEUE_NAME,
- },
- "upload_html": {
-     "queue": UPLOAD_QUEUE_NAME,
-     "routing_key": UPLOAD_QUEUE_NAME,
- },
+ }
}

def get(self, attribute_name: str):
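For reference, after this commit the queue declarations and task routes all follow the same pattern, one per remaining task type. A condensed sketch of that shape (the `crawl_queue` name is an assumption; the rest come from the diff):

```python
from kombu import Exchange, Queue

# Condensed sketch of the post-change config shape. Each remaining task
# type keeps a queue with a same-named exchange and routing key; the
# upload_queue entry is gone from both structures.
QUEUE_NAMES = (
    "crawl_queue",  # assumption: name of the html crawl queue
    "lighthouse_queue",
    "technologies_queue",
    "responsiveness_queue",
    "carbon_footprint_queue",
)

task_queues = tuple(
    Queue(name, Exchange(name), routing_key=name) for name in QUEUE_NAMES
)

task_routes = {
    # One entry per remaining task; "upload_html" no longer appears.
    "get_carbon_footprint": {
        "queue": "carbon_footprint_queue",
        "routing_key": "carbon_footprint_queue",
    },
}
```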
app/models/crawl.py (1 change: 0 additions & 1 deletion)
@@ -35,7 +35,6 @@ class CrawlModel(BaseModel):
technologies_and_trackers: MetadataTask | None = None
responsiveness: MetadataTask | None = None
carbon_footprint: MetadataTask | None = None
- uploads: BaseTaskModel = Field(default_factory=BaseTaskModel)

@property
def enabled_metadata(self) -> list[MetadataType]:
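With the `uploads` field gone, task state on the model is tracked per metadata type only. A minimal sketch of the resulting shape, with `MetadataTask` stubbed in place of the project's real class:

```python
from pydantic import BaseModel

class MetadataTask(BaseModel):
    # Stub standing in for the project's own task-state model.
    status: str = "pending"

class CrawlModel(BaseModel):
    # Each metadata job tracks its own state; the crawl-level
    # uploads task (a BaseTaskModel field) no longer exists.
    lighthouse: MetadataTask | None = None
    technologies_and_trackers: MetadataTask | None = None
    responsiveness: MetadataTask | None = None
    carbon_footprint: MetadataTask | None = None
```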
client/src/_types/crawls.ts (1 change: 0 additions & 1 deletion)
@@ -23,7 +23,6 @@ export type Crawl = {
responsiveness: MetadataResult;
carbon_footprint: MetadataResult;
html_crawl: MetadataResult;
- uploads: MetadataResult;
}

export type CrawlCount = {
(Filename not shown in this view: 5 changes, 2 additions & 3 deletions)
@@ -4,16 +4,15 @@ import { Crawl, MetadataResult } from '../../../../../_types/crawls'
import { timeBetween } from '../utils/dates';
import { getJobStatus } from '../utils/status';

- type Metadata = 'html_crawl' | 'lighthouse' | 'responsiveness' | 'technologies_and_trackers' | 'carbon_footprint' | 'uploads';
- const metadatas: Metadata[] = ['html_crawl', 'lighthouse', 'responsiveness', 'technologies_and_trackers', 'carbon_footprint', 'uploads']
+ type Metadata = 'html_crawl' | 'lighthouse' | 'responsiveness' | 'technologies_and_trackers' | 'carbon_footprint';
+ const metadatas: Metadata[] = ['html_crawl', 'lighthouse', 'responsiveness', 'technologies_and_trackers', 'carbon_footprint']

const nameMap: { [key in Metadata]: string } = {
html_crawl: 'Crawl',
lighthouse: 'LightHouse',
responsiveness: 'Responsive',
technologies_and_trackers: 'Technologies',
carbon_footprint: 'Empreinte carbone',
- uploads: 'Upload'
}

function downloadFiles(url: string) {
docker-compose.yml (13 changes: 9 additions & 4 deletions)
@@ -42,33 +42,38 @@ services:
deploy:
restart_policy:
condition: on-failure
+ replicas: 1

lighthouse_worker:
<<: *worker
build:
context: .
dockerfile: Lighthouse.Dockerfile
command: watchfiles --filter python '/bin/bash -c "source /opt/venv/bin/activate && celery -A celery_broker.main.celery_app worker -l info -P solo -n lighthouse_worker -Q lighthouse_queue"'
+ deploy:
+     replicas: 1

technologies_worker:
<<: *worker
build:
context: .
dockerfile: Wappalyzer.Dockerfile
command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n technologies_worker -Q technologies_queue'
+ deploy:
+     replicas: 1

responsiveness_worker:
<<: *worker
command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n responsiveness_worker -Q responsiveness_queue'
+ deploy:
+     replicas: 1

carbon_footprint_worker:
<<: *worker
command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n carbon_footprint_worker -Q carbon_footprint_queue'
+ deploy:
+     replicas: 1

- upload_worker:
-     <<: *worker
-     command: watchfiles --filter python 'celery -A celery_broker.main.celery_app worker -l info -P solo -n upload_worker -Q upload_queue'
-     env_file: .env



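After rebuilding the stack, one can confirm that no worker still consumes the removed queue. A sketch, assuming it runs somewhere the project's Celery app is importable and the broker is reachable:

```python
from celery_broker.main import celery_app

# Ask every running worker which queues it consumes. After this commit,
# upload_queue should appear nowhere, and upload_worker is gone entirely.
inspector = celery_app.control.inspect()
active_queues = inspector.active_queues() or {}

for worker_name, queues in active_queues.items():
    names = [q["name"] for q in queues]
    print(worker_name, names)
    assert "upload_queue" not in names
```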
