Commit
Showing 68 changed files with 1,042 additions and 1,415 deletions.
@@ -106,13 +106,6 @@ jobs:
         with:
           namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
           restart: oc-client
-      - name: Restart oc-responsiveness-worker deployment
-        uses: dataesr/[email protected]
-        env:
-          KUBE_CONFIG: ${{ secrets.KUBE_CONFIG_DOAD_STAGING }}
-        with:
-          namespace: ${{ env.DEPLOYMENT_NAMESPACE }}
-          restart: oc-responsiveness-worker
       - name: Restart oc-technologies-worker deployment
         uses: dataesr/[email protected]
         env:
@@ -0,0 +1,4 @@
+from app.api.crawls import crawls_router
+from app.api.websites import websites_router
+
+__all__ = ["crawls_router", "websites_router"]
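The new app/api package module re-exports the two routers so the rest of the application can import them from one place. As a rough illustration only: the sketch below assumes crawls_router and websites_router are FastAPI APIRouter objects, and the create_app factory is hypothetical, not part of this commit.

from fastapi import FastAPI

# crawls_router and websites_router are the names re-exported by the new
# __init__.py above; the factory itself is illustrative, not from the repo.
from app.api import crawls_router, websites_router


def create_app() -> FastAPI:
    app = FastAPI()
    app.include_router(websites_router)
    app.include_router(crawls_router)
    return app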
@@ -1,53 +1,39 @@
-from urllib.parse import urlparse
-
 from celery import group, chain, chord
 
-from app.repositories.crawls import crawls
-from app.celery_broker.tasks import (
-    METADATA_TASK_REGISTRY,
-    start_crawl_process, finalize_crawl_process,
+from app.models.enums import MetadataType
+
+from app.tasks import (
+    get_lighthouse,
+    get_technologies,
+    get_carbon_footprint,
+    get_html_crawl,
+    finalize_crawl_process,
 )
 from app.models.crawl import CrawlModel
-from app.models.website import WebsiteModel
-from app.services.crawler_logger import logger
-
-
-def create_crawl(website: WebsiteModel) -> CrawlModel:
-
-    # Check if the path component of the URL is empty or "/"
-    # If the crawl target is a single page, we will ignore the depth and the limit in the request.
-    if not is_domain(website.url):
-        website.depth = 0
-        website.limit = 1
-        logger.warning("The url to crawl is not a domain. Only one page will be crawled")
-
-    crawl: CrawlModel = CrawlModel(
-        website_id=website.id,
-        config=website.to_config(),
-    )
-    crawl.init_tasks()
-    crawls.create(crawl)
-    return crawl
+from app.services.logging import logger
+
+METADATA_TASK_REGISTRY = {
+    MetadataType.LIGHTHOUSE: get_lighthouse,
+    MetadataType.TECHNOLOGIES: get_technologies,
+    MetadataType.CARBON_FOOTPRINT: get_carbon_footprint,
+}
 
 
 def start_crawl(crawl: CrawlModel) -> None:
     logger.info(
-        f"New crawl process ({crawl.id}) for website {crawl.config.url}"
+        f"New crawl process ({crawl.id}) for website {crawl.url}"
     )
     metadata_tasks = group(
-        METADATA_TASK_REGISTRY.get(metadata).s()
+        METADATA_TASK_REGISTRY.get(metadata).si(crawl.id)
        for metadata in crawl.enabled_metadata
     )
     # If a task in a chain fails, the remaining tasks in the chain will not be executed.
     # To ensure that `finalize_crawl` is executed regardless of whether the previous tasks in the chain fail or succeed,
     # We need to put it in the `link_error` callback in start_crawl_process and do a chord with the metadata tasks.
     chain(
-        start_crawl_process.s(crawl).on_error(finalize_crawl_process.s(crawl)),
-        chord(metadata_tasks, finalize_crawl_process.s(crawl)),
+        get_html_crawl.si(crawl.id).on_error(
+            finalize_crawl_process.si(crawl.id)),
+        chord(metadata_tasks, finalize_crawl_process.si(crawl.id)),
+        finalize_crawl_process.si(crawl.id)
     ).apply_async(task_id=crawl.id)
-
-
-def is_domain(url: str) -> bool:
-    parsed_url = urlparse(url)
-    return parsed_url.path == '' or parsed_url.path == '/'
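The comment kept in the diff explains the failure-handling choice: in a Celery chain, a failed task stops the remaining tasks, so finalize_crawl_process is attached as an on_error callback on the first task, used as the chord callback for the metadata group, and appended as the last chain step. The sketch below reproduces that pattern in isolation; the task names crawl_pages, collect_metadata and finalize are placeholders, not the project's real tasks.

from celery import Celery, chain, chord

app = Celery("sketch", broker="memory://")


@app.task
def crawl_pages(crawl_id):
    print(f"crawling {crawl_id}")


@app.task
def collect_metadata(crawl_id):
    print(f"collecting metadata for {crawl_id}")


@app.task
def finalize(crawl_id):
    # Used as error callback, chord callback and final chain step, so the
    # crawl is always closed out whether or not an earlier task failed.
    print(f"finalizing {crawl_id}")


def start(crawl_id: str) -> None:
    # .si() builds an immutable signature: each task receives only crawl_id
    # and ignores the previous task's return value, as in the diff above.
    chain(
        crawl_pages.si(crawl_id).on_error(finalize.si(crawl_id)),
        chord([collect_metadata.si(crawl_id)], finalize.si(crawl_id)),
        finalize.si(crawl_id),
    ).apply_async(task_id=crawl_id)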