Skip to content

Commit

Permalink
Keep WARC download running on individual connection errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
janheinrichmerker committed Nov 27, 2023
1 parent d43a466 commit 48678da
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions archive_query_log/downloaders/warc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,11 +2,13 @@
from itertools import chain
from typing import Iterable, Iterator, TypeVar, Generic, Type, Callable
from uuid import uuid5
from warnings import warn

from click import echo
from elasticsearch_dsl import Search
from elasticsearch_dsl.function import RandomScore
from elasticsearch_dsl.query import Exists, FunctionScore, Term, RankFeature
from requests import ConnectionError as RequestsConnectionError
from tqdm.auto import tqdm
from warc_s3 import WarcS3Record
from warcio.recordloader import ArcWarcRecord
Expand Down Expand Up @@ -71,11 +73,18 @@ def _download_serp_warc(
api_url=serp.archive.memento_api_url,
session=config.http.session,
)
records = memento_api.load_url_warc(
url=serp.capture.url,
timestamp=serp.capture.timestamp,
raw=True,
)
try:
records = memento_api.load_url_warc(
url=serp.capture.url,
timestamp=serp.capture.timestamp,
raw=True,
)
except RequestsConnectionError:
warn(RuntimeWarning(
f"Connection error while downloading WARC "
f"for capture URL {serp.capture.url} at {serp.capture.timestamp}."
))
return
for record in records:
yield _SerpArcWarcRecord(serp, record)

Expand Down

0 comments on commit 48678da

Please sign in to comment.