Bug fixes #35

Merged: 11 commits, Dec 15, 2023
6 changes: 0 additions & 6 deletions app/celery_broker/metadata_utils.py
@@ -87,9 +87,6 @@ def metadata_task(
task_name=metadata_type,
task=task,
)
crawls.update_status(
crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
)
continue
except Exception as e:
logger.error(
@@ -102,8 +99,5 @@ def metadata_task(
task_name=metadata_type,
task=task,
)
crawls.update_status(
crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
)
continue
return handle_metadata_result(task, crawl_process, result, metadata_type)
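
Note: the two deleted blocks above drop the crawl-level PARTIAL_ERROR writes from the per-metadata error handlers; each handler now only records its own failure and continues, and the overall crawl status is derived once from the sub-task statuses in finalize_crawl_process (see the tasks.py changes below).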
40 changes: 29 additions & 11 deletions app/celery_broker/tasks.py
@@ -1,8 +1,9 @@
# Standard library imports
import asyncio
import os
import pathlib
import shutil
from multiprocessing import Process, Manager
from multiprocessing import Manager, Process
from typing import Optional

# Local imports
@@ -49,14 +50,17 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
kwargs={"crawl_process": crawl_process, "results": shared_dict},
)
p.start()
p.join() # TODO define and add a timeout
p.join(120) # Wait 120 seconds for the crawler to finish
if p.is_alive():
logger.error("Crawler timed out, the crawl may not contain enough pages")
p.terminate()
p.join()

crawl_process.metadata.update(shared_dict["metadata"])

except Exception as e:
logger.error(f"Error while crawling html files: {e}")
set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.ERROR
)
self.update_state(state='FAILURE')
return crawl_process
try:
@@ -66,9 +70,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
logger.error(f"Error while uploading html files: {e}")
# Html crawl will be considered failed if we can't upload the html files
set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.ERROR
)
self.update_state(state='FAILURE')
return crawl_process
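
The open-ended p.join() in start_crawl_process above is replaced with a bounded wait plus an explicit terminate. A minimal, self-contained sketch of that pattern, assuming a stand-in worker function (run_crawler is purely illustrative; the 120-second value comes from the diff):

from multiprocessing import Process
import logging
import time

logger = logging.getLogger(__name__)

def run_crawler() -> None:
    # Stand-in for the real crawl entry point; pretend it hangs.
    time.sleep(300)

if __name__ == "__main__":
    p = Process(target=run_crawler)
    p.start()
    p.join(120)       # wait at most 120 seconds for the crawler to finish
    if p.is_alive():  # still running after the timeout
        logger.error("Crawler timed out, the crawl may not contain enough pages")
        p.terminate() # stop the child process
        p.join()      # reap it so no zombie process is left behind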

@@ -131,10 +132,27 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
# Retrieve the current status of the crawl
current_crawl = crawls.get(crawl_id=crawl.id)

have_success = False

# Retrieve the status of every sub-task (html_crawl, lighthouse, technologies, responsiveness, carbon_footprint).
# If any sub-task failed, the crawl is considered partially failed.
all_tasks = [current_crawl.html_crawl, current_crawl.lighthouse, current_crawl.technologies_and_trackers, current_crawl.responsiveness, current_crawl.carbon_footprint]
for task in all_tasks:
if task is None:
continue
if task.status != ProcessStatus.SUCCESS:
current_crawl.status = ProcessStatus.PARTIAL_ERROR
else:
have_success = True

if current_crawl.status == ProcessStatus.STARTED:
crawls.update_status(
crawl_id=crawl.id, status=ProcessStatus.SUCCESS
)
current_crawl.status = ProcessStatus.SUCCESS
elif not have_success:
current_crawl.status = ProcessStatus.ERROR

crawls.update_status(
crawl_id=crawl.id, status=current_crawl.status
)

websites.store_last_crawl(
website_id=crawl.website_id,
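
The new finalize_crawl_process block downgrades the crawl to PARTIAL_ERROR as soon as any sub-task is not successful, promotes a still-STARTED crawl to SUCCESS, and falls back to ERROR when no sub-task succeeded at all. A condensed sketch of that decision, assuming a simplified stand-in for the project's ProcessStatus enum:

from enum import Enum
from typing import Optional

class ProcessStatus(str, Enum):  # simplified stand-in for the project's enum
    STARTED = "started"
    SUCCESS = "success"
    ERROR = "error"
    PARTIAL_ERROR = "partial_error"

def final_status(current: ProcessStatus,
                 sub_tasks: list[Optional[ProcessStatus]]) -> ProcessStatus:
    ran = [s for s in sub_tasks if s is not None]    # skip sub-tasks that never ran
    have_success = any(s == ProcessStatus.SUCCESS for s in ran)
    if any(s != ProcessStatus.SUCCESS for s in ran):
        current = ProcessStatus.PARTIAL_ERROR        # at least one sub-task failed
    if current == ProcessStatus.STARTED:
        return ProcessStatus.SUCCESS                 # every sub-task succeeded
    if not have_success:
        return ProcessStatus.ERROR                   # nothing succeeded at all
    return current

# e.g. final_status(ProcessStatus.STARTED,
#                   [ProcessStatus.SUCCESS, ProcessStatus.ERROR, None]) -> PARTIAL_ERROR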
18 changes: 17 additions & 1 deletion app/crawler/middlewares.py
@@ -3,6 +3,7 @@
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from pathlib import Path
from urllib.parse import urlparse

from app.config import settings

@@ -48,7 +49,7 @@ def from_crawler(cls, crawler):
def _format_file_path(self, response, spider) -> Path:
domain = spider.allowed_domains[0]
base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
file_name = response.url.split(f"{domain}")[-1].lstrip('/')
file_name = response.url.split(f"{domain}")[-1].strip('/')
if not file_name.endswith(".html"):
file_name = f"{file_name}.html"
if file_name == ".html":
@@ -63,12 +64,27 @@ def _save_html_locally(self, response, spider):
file_path.write_text(response.text)

def process_response(self, request, response, spider):

if self.page_limit != 0 and self.current_page_count >= self.page_limit:
raise IgnoreRequest(
f"Page limit reached. Ignoring request {request}"
)

if request.url.endswith("robots.txt"):
return response

if spider.first_real_url is None:
# Set allowed_url and allowed_domains only once, on the first request.
# This is useful when the first request is permanently redirected.
spider.first_real_url = response.url
parsed_url = urlparse(spider.first_real_url)
if parsed_url.path:
spider.allowed_url = parsed_url.path
else:
spider.allowed_url = parsed_url.netloc
spider.allowed_domains = [parsed_url.netloc]


if response.status == 200:
self.current_page_count += 1
self._save_html_locally(response, spider)
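
With first_real_url, the middleware re-derives the spider's crawl scope from the first real (non-robots.txt) response, so a permanent redirect on the start URL no longer causes every following page to be filtered out. A small sketch of that derivation, with an illustrative URL (derive_scope is a hypothetical helper name):

from urllib.parse import urlparse

def derive_scope(first_real_url: str) -> tuple[str, list[str]]:
    # Mirrors the middleware: restrict to the path when there is one,
    # otherwise allow the whole host; the domain list follows the response URL.
    parsed = urlparse(first_real_url)
    allowed_url = parsed.path if parsed.path else parsed.netloc
    return allowed_url, [parsed.netloc]

# Example: the start URL was permanently redirected to a sub-path
print(derive_scope("https://www.example.org/actualites/"))
# -> ('/actualites/', ['www.example.org'])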
20 changes: 7 additions & 13 deletions app/crawler/spider.py
@@ -1,5 +1,3 @@
from urllib.parse import urlparse

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
@@ -11,37 +9,33 @@ class MenesrSpider(CrawlSpider):
rules = (Rule(),)
use_playwright = False
allowed_url = None
first_real_url = None
page_count = 0
page_limit = 0
depth_limit = 0

def __init__(self, crawl_process: CrawlProcess, *a, **kw):
parsed_url = urlparse(crawl_process.config.url)
self.use_playwright = crawl_process.config.parameters.use_playwright
if parsed_url.path:
self.allowed_url = parsed_url.path
self.page_limit = crawl_process.config.parameters.limit
self.depth_limit = crawl_process.config.parameters.depth
self.allowed_domains = [parsed_url.netloc]
self.start_urls = [crawl_process.config.url]
self.crawl_process = crawl_process
super().__init__(*a, **kw)


def start_requests(self):
for url in self.start_urls:
meta = {
"depth": 0, # Set the initial depth to 0
}
if self.use_playwright:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
meta.update({
"playwright": True,
"playwright_page_methods": [
("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
]
})
else:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
],
})
yield Request(url, self.parse, meta=meta)

def parse(self, response, **kwargs):
# Crawl the links in the response page and continue to crawl the next page
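
start_requests now builds a single meta dict and only layers the Playwright keys on top when use_playwright is enabled, so the depth initialisation is written once and a single yield covers both cases. A reduced sketch of that merge (build_meta is a hypothetical helper; the Playwright keys are copied from the diff):

def build_meta(use_playwright: bool) -> dict:
    meta = {
        "depth": 0,  # set the initial depth to 0
    }
    if use_playwright:
        # Extra keys are only added when Playwright rendering is requested.
        meta.update({
            "playwright": True,
            "playwright_page_methods": [
                ("evaluate", "window.scrollTo(0, document.body.scrollHeight)"),
            ],
        })
    return meta

print(build_meta(False))  # {'depth': 0}
print(build_meta(True))   # adds the Playwright-specific keys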
2 changes: 1 addition & 1 deletion app/repositories/crawls.py
@@ -67,7 +67,7 @@ def update_status(self, crawl_id: str, status: ProcessStatus):
update_dict = {"status": status}
if status == ProcessStatus.STARTED:
update_dict["started_at"] = french_datetime()
if status == ProcessStatus.SUCCESS:
else:
update_dict["finished_at"] = french_datetime()
self.collection.update_one(
filter={"id": crawl_id},
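
With this repository change, finished_at is stamped for every non-STARTED status rather than only SUCCESS, so crawls ending in ERROR or PARTIAL_ERROR also record a completion time. A minimal sketch of the resulting update-dict construction (datetime.now stands in for the project's french_datetime helper):

from datetime import datetime

def build_status_update(status: str) -> dict:
    update = {"status": status}
    if status == "started":
        update["started_at"] = datetime.now()   # stand-in for french_datetime()
    else:
        # Any other status is terminal and gets a completion timestamp.
        update["finished_at"] = datetime.now()
    return update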
3 changes: 0 additions & 3 deletions tests/tests_crawler/test_menesr.py
@@ -20,16 +20,13 @@ def test_init_without_path(self):
self.mock_crawl_process.config.url = "http://example.com/"
spider = MenesrSpider(self.mock_crawl_process)

self.assertEqual(spider.allowed_domains, ["example.com"])
self.assertEqual(spider.start_urls, ["http://example.com/"])
self.assertTrue(isinstance(spider.rules, tuple))

def test_init_with_path(self):
spider = MenesrSpider(self.mock_crawl_process)

# Checking initialized values
parsed_url = urlparse(self.mock_crawl_process.config.url)
self.assertEqual(spider.allowed_domains, [parsed_url.netloc])
self.assertEqual(
spider.start_urls, [self.mock_crawl_process.config.url]
)