From ab84b15dd2c83b7e057999251715286ed6ed1f29 Mon Sep 17 00:00:00 2001
From: krassowski <5832902+krassowski@users.noreply.github.com>
Date: Thu, 12 Oct 2023 13:22:14 +0100
Subject: [PATCH] Support pagination of search results

---
 easy_entrez/api.py     | 42 ++++++++++++++++++++---
 easy_entrez/batch.py   | 83 +++++++++++++++++++++++++++++++++++++++++-
 easy_entrez/queries.py | 15 ++++----
 3 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/easy_entrez/api.py b/easy_entrez/api.py
index 4151cb3..6dd9a54 100644
--- a/easy_entrez/api.py
+++ b/easy_entrez/api.py
@@ -5,8 +5,9 @@
 from xml.etree import ElementTree
 from copy import copy
 from time import time, sleep
+from warnings import warn
 
-from .batch import supports_batches
+from .batch import supports_batches, supports_pagination
 from .types import ReturnType, DataType, EntrezDatabase, CommandType, Citation
 from .queries import (
     EntrezQuery, SearchQuery, SummaryQuery, FetchQuery, LinkQuery, InfoQuery, CitationQuery, uses_query,
@@ -97,6 +98,8 @@ def __init__(
         self.minimal_interval = minimal_interval
         self._batch_size: Optional[int] = None
         self._batch_sleep_interval: int = 3
+        self._page_size: Optional[int] = None
+        self._page_sleep_interval: int = 3
         self._last_request_time = None
         self.timeout = timeout
 
@@ -138,17 +141,23 @@ def _request(self, query: EntrezQuery, custom_payload=None) -> EntrezResponse:
 
         return EntrezResponse(query=query, response=response, api=self)
 
-    # TODO: make entrez response a generic and provide better typing of responses
+    # TODO: make entrez response a generic and provide better typing of response
+    @supports_pagination
     @uses_query(SearchQuery)
     def search(
-        self, term: Union[str, dict], max_results: int,
-        database: EntrezDatabase = 'pubmed', min_date=None, max_date=None
+        self, term: Union[str, dict], max_results: Optional[int] = None,
+        database: EntrezDatabase = 'pubmed', min_date=None, max_date=None,
+        resume_from: Optional[int] = None
     ):
         if isinstance(term, dict):
             term = _match_all(**term)
 
         assert not min_date and not max_date  # TODO
-        query = SearchQuery(term=term, max_results=max_results, database=database)
+        self._ensure_max_results(max_results)
+        query = SearchQuery(
+            term=term, max_results=max_results, database=database,
+            resume_from=resume_from
+        )
         return self._request(query=query)
 
     def in_batches_of(self, size: int = 100, sleep_interval: int = 3):
@@ -157,6 +166,13 @@ def in_batches_of(self, size: int = 100, sleep_interval: int = 3):
         batch_mode._batch_sleep_interval = sleep_interval
         return batch_mode
 
+    def page_by_page(self, size: int = 100, sleep_interval: int = 3):
+        """Experimental pagination mode allowing to download all search results page by page."""
+        pagination_mode = copy(self)
+        pagination_mode._page_size = size
+        pagination_mode._page_sleep_interval = sleep_interval
+        return pagination_mode
+
     @supports_batches
     @uses_query(SummaryQuery)
     def summarize(
@@ -217,3 +233,19 @@ def _ensure_list_like(ids: List[str]):
             raise ValueError(
                 f'Received {atomic_iterable_type.__name__} but a list-like container of identifiers was expected'
             )
+
+    def _ensure_max_results(self, max_results):
+        """Reject a missing or over-limit `max_results` before a search is sent."""
+        if max_results is None:
+            if self._page_size is None:
+                raise ValueError(
+                    'Please specify `max_results`, or use pagination mode'
+                    ' `api.page_by_page().search()`'
+                )
+            # nothing to range-check; comparing None with an int would raise TypeError
+            return
+        if max_results > 10_000:
+            raise ValueError(
+                'Fetching more than 10,000 results requires enabling pagination'
+                ' with `api.page_by_page().search()`'
+            )
diff --git a/easy_entrez/batch.py b/easy_entrez/batch.py
index 4ae2a3e..1681878 100644
--- a/easy_entrez/batch.py
+++ b/easy_entrez/batch.py
@@ -6,11 +6,18 @@
 from requests import RequestException
 
 
+class TqdmMock:
+    total: int
+    def update(self, i: int):
+        pass
+
 
 try:
     from tqdm import tqdm
 except ImportError:
-    def tqdm(iterable):
+    def tqdm(iterable=None):
+        if iterable is None:
+            return TqdmMock()
         return iterable
 
 
@@ -72,3 +79,77 @@ def batches_support_wrapper(self: 'EntrezAPI', collection: Sequence, *args, **kw
     return batches_support_wrapper
 
 
+def supports_pagination(func):
+    """
+    Call the decorated function once per page of results when pagination
+    mode is enabled (see `EntrezAPI.page_by_page`), retrying a failed page
+    after an interval twice the between-page interval.
+
+    In pagination mode the wrapper returns a dict mapping 1-based page
+    numbers to responses; otherwise the function is called unchanged.
+    """
+
+    @wraps(func)
+    def pagination_support_wrapper(self: 'EntrezAPI', *args, **kwargs):
+        size = self._page_size
+        interval = self._page_sleep_interval
+        if size is None:
+            # pagination disabled: call the function unchanged
+            return func(self, *args, **kwargs)
+
+        assert isinstance(size, int)
+        by_page = {}
+        page = 0
+        count = None
+        downloaded = 0
+        progress = tqdm()
+        # the page size replaces any caller-supplied limit
+        if kwargs.pop('max_results', None) is not None:
+            warn('`max_results` has no effect in pagination mode')
+
+        finished = False
+        while not finished:
+            done = False
+
+            while not done:
+                reason = None
+                try:
+                    page_result = func(self, *args, **kwargs, resume_from=page * size, max_results=size)
+                    code = page_result.response.status_code
+                    if code == 200:
+                        result_type = page_result.data['header']['type']
+                        result_info = page_result.data[f'{result_type}result']
+                        count = int(result_info['count'])
+                        # check the offset before advancing `page`
+                        assert page * size == int(result_info['retstart'])
+                        # NOTE(review): a short final page presumably reports fewer than `size` - confirm
+                        assert int(result_info['retmax']) <= size
+                        progress.total = count
+                        # `update` takes an increment, not a running total
+                        progress.update(size)
+                        downloaded += size
+                        page += 1
+                        done = True
+                    else:
+                        reason = f'Status code != 200 (= {code})'
+                except RequestException as e:
+                    reason = e
+
+                if not done:
+                    warn(
+                        f'Failed to fetch for {page}-th page, retrying in {interval * 2} seconds.'
+                        f' The reason was: {reason}'
+                    )
+                    sleep(interval * 2)
+            if count is None:
+                raise ValueError('Count not set after first page')
+            if downloaded >= count:
+                finished = True
+            by_page[page] = page_result
+            sleep(interval)
+        return by_page
+
+    if not pagination_support_wrapper.__doc__:
+        pagination_support_wrapper.__doc__ = ''
+    pagination_support_wrapper.__doc__ += '\n        Supports pagination mode, see :py:meth:`~EntrezAPI.page_by_page`.'
+    return pagination_support_wrapper
diff --git a/easy_entrez/queries.py b/easy_entrez/queries.py
index 0a3a269..0f7f43f 100644
--- a/easy_entrez/queries.py
+++ b/easy_entrez/queries.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Dict, List, Iterable, Type
+from typing import Dict, List, Iterable, Optional, Type
 from typing_extensions import Literal
 from warnings import warn
 
@@ -86,16 +86,15 @@ class SearchQuery(EntrezQuery):
     """
     endpoint = 'esearch'
     term: str
-    max_results: int
-
-    def validate(self):
-        super().validate()
-        if self.max_results > 100_000:
-            raise ValueError('Fetching more than 100,000 results is not implemented')
+    max_results: Optional[int] = None
+    resume_from: Optional[int] = None
 
     def to_params(self) -> Dict[str, str]:
         params = super().to_params()
-        params['retmax'] = str(self.max_results)
+        if self.max_results is not None:
+            params['retmax'] = str(self.max_results)
+        if self.resume_from is not None:
+            params['retstart'] = str(self.resume_from)
         params['term'] = self.term
         return params
 