Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support pagination of search results #21

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions easy_entrez/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from xml.etree import ElementTree
from copy import copy
from time import time, sleep
from warnings import warn

from .batch import supports_batches
from .batch import supports_batches, supports_pagination
from .types import ReturnType, DataType, EntrezDatabase, CommandType, Citation
from .queries import (
EntrezQuery, SearchQuery, SummaryQuery, FetchQuery, LinkQuery, InfoQuery, CitationQuery, uses_query,
Expand Down Expand Up @@ -97,6 +98,8 @@ def __init__(
self.minimal_interval = minimal_interval
self._batch_size: Optional[int] = None
self._batch_sleep_interval: int = 3
self._page_size: Optional[int] = None
self._page_sleep_interval: int = 3
self._last_request_time = None
self.timeout = timeout

Expand Down Expand Up @@ -138,17 +141,23 @@ def _request(self, query: EntrezQuery, custom_payload=None) -> EntrezResponse:

return EntrezResponse(query=query, response=response, api=self)

# TODO: make entrez response a generic and provide better typing of responses
# TODO: make entrez response a generic and provide better typing of response
@supports_pagination
@uses_query(SearchQuery)
def search(
self, term: Union[str, dict], max_results: int,
database: EntrezDatabase = 'pubmed', min_date=None, max_date=None
self, term: Union[str, dict], max_results: Optional[int] = None,
database: EntrezDatabase = 'pubmed', min_date=None, max_date=None,
resume_from: Optional[int] = None
):
if isinstance(term, dict):
term = _match_all(**term)

assert not min_date and not max_date # TODO
query = SearchQuery(term=term, max_results=max_results, database=database)
self._ensure_max_results(max_results)
query = SearchQuery(
term=term, max_results=max_results, database=database,
resume_from=resume_from
)
return self._request(query=query)

def in_batches_of(self, size: int = 100, sleep_interval: int = 3):
Expand All @@ -157,6 +166,13 @@ def in_batches_of(self, size: int = 100, sleep_interval: int = 3):
batch_mode._batch_sleep_interval = sleep_interval
return batch_mode

def page_by_page(self, size: int = 100, sleep_interval: int = 3):
    """
    Return a copy of this API object switched to the (experimental) pagination mode.

    The copy carries the requested page `size` and the `sleep_interval`
    to be used between pages; the object this method is called on is left untouched.
    """
    paginated = copy(self)
    # configure the copy only, so that the original keeps working in the regular mode
    paginated._page_sleep_interval = sleep_interval
    paginated._page_size = size
    return paginated

@supports_batches
@uses_query(SummaryQuery)
def summarize(
Expand Down Expand Up @@ -217,3 +233,17 @@ def _ensure_list_like(ids: List[str]):
raise ValueError(
f'Received {atomic_iterable_type.__name__} but a list-like container of identifiers was expected'
)

def _ensure_max_results(self, max_results):
    """
    Validate the `max_results` limit of a search against the current mode.

    Outside of pagination mode (`self._page_size` is `None`) a limit is required.
    In pagination mode a missing limit is accepted, as the page size bounds each request.
    A limit above 10,000 is rejected in either mode.

    Raises:
        ValueError: if no limit was given outside of pagination mode,
            or if the limit exceeds 10,000.
    """
    if max_results is None:
        if self._page_size is not None:
            # Nothing to validate; returning here also avoids comparing `None`
            # against an integer below, which would raise a `TypeError`.
            return
        raise ValueError(
            'Please specify `max_results`, or use pagination mode'
            ' `api.page_by_page().search(<your arguments>)`'
        )
    # No warning for a non-None value in pagination mode: the pagination wrapper
    # forwards the page size as `max_results`, so that is the regular case there.
    if max_results > 10_000:
        raise ValueError(
            'Fetching more than 10,000 results requires enabling pagination'
            ' with `api.page_by_page().search(<your arguments>)`'
        )
74 changes: 73 additions & 1 deletion easy_entrez/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,18 @@

from requests import RequestException

class TqdmMock:
    """Do-nothing stand-in for a manually driven progress bar."""

    # declared only (no default value); set by the code driving the bar
    total: int

    def update(self, i: int):
        """Accept a progress update and discard it."""
        return None


try:
from tqdm import tqdm
except ImportError:
def tqdm(iterable):
def tqdm(iterable=None):
if iterable is None:
return TqdmMock()
return iterable


Expand Down Expand Up @@ -72,7 +79,72 @@ def batches_support_wrapper(self: 'EntrezAPI', collection: Sequence, *args, **kw
return batches_support_wrapper


def supports_pagination(func):
    """
    Add pagination mode support to the decorated search-like method.

    Outside of pagination mode (``self._page_size`` is ``None``) the decorated
    function is called once, with its arguments unchanged.

    In pagination mode the decorated function is called once per page, with
    ``max_results`` set to the page size and ``resume_from`` set to the offset
    of the page, until the ``count`` reported in the response is covered.
    A failed request (``RequestException`` or a status code other than 200)
    is retried - with no limit on the number of attempts - after an interval
    twice the between-page interval.

    In pagination mode the wrapper returns a dictionary mapping 1-based page
    numbers to the responses obtained for these pages.

    NOTE(review): ``max_results`` and ``resume_from`` are injected as keyword
    arguments, so in pagination mode callers should not pass them positionally
    (and should not pass ``resume_from`` at all) - confirm against callers.
    """

    @wraps(func)
    def pagination_support_wrapper(self: 'EntrezAPI', *args, **kwargs):
        size = self._page_size
        if size is None:
            # regular mode: pass the call through untouched
            return func(self, *args, **kwargs)

        if not isinstance(size, int) or size <= 0:
            # a non-positive page size would never advance and loop forever
            raise ValueError(f'Page size has to be a positive integer, got {size!r}')

        interval = self._page_sleep_interval

        # the page size takes over the role of the limit; tell the user
        # when a limit that they provided explicitly is being dropped
        if kwargs.pop('max_results', None) is not None:
            warn('`max_results` has no effect in pagination mode')

        by_page = {}
        page = 0  # number of pages fetched successfully so far
        downloaded = 0
        progress = tqdm()

        while True:
            offset = page * size
            page_result = None
            reason = None
            try:
                attempt = func(self, *args, **kwargs, resume_from=offset, max_results=size)
                code = attempt.response.status_code
                if code == 200:
                    page_result = attempt
                else:
                    reason = f'Status code != 200 (= {code})'
            except RequestException as e:
                reason = e

            if page_result is None:
                # counters are untouched at this point, so the same page is requested again
                warn(
                    f'Failed to fetch for {page + 1}-th page, retrying in {interval * 2} seconds.'
                    f' The reason was: {reason}'
                )
                sleep(interval * 2)
                continue

            result_type = page_result.data['header']['type']
            result_info = page_result.data[f'{result_type}result']
            # assumes the response echoes the requested offset in `retstart`
            # (the original assertion relied on the same) - TODO confirm
            if int(result_info['retstart']) != offset:
                raise ValueError(
                    f'Expected results starting at {offset},'
                    f' but received results starting at {result_info["retstart"]}'
                )
            count = int(result_info['count'])

            page += 1
            by_page[page] = page_result

            # the last page may be shorter than `size`, hence the cap at `count`
            covered = min(offset + size, count)
            progress.total = count
            # `update` expects an increment rather than the running total
            progress.update(max(covered - downloaded, 0))
            downloaded = covered

            if downloaded >= count:
                return by_page
            sleep(interval)

    if not pagination_support_wrapper.__doc__:
        pagination_support_wrapper.__doc__ = ''

    pagination_support_wrapper.__doc__ += '\n    Supports pagination mode, see :py:meth:`~EntrezAPI.page_by_page`.'

    return pagination_support_wrapper
12 changes: 5 additions & 7 deletions easy_entrez/queries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Iterable, Type
from typing import Dict, List, Iterable, Optional, Type
from typing_extensions import Literal
from warnings import warn

Expand Down Expand Up @@ -86,16 +86,14 @@ class SearchQuery(EntrezQuery):
"""
endpoint = 'esearch'
term: str
max_results: int

def validate(self):
super().validate()
if self.max_results > 100_000:
raise ValueError('Fetching more than 100,000 results is not implemented')
max_results: Optional[int] = None
resume_from: Optional[int] = None

def to_params(self) -> Dict[str, str]:
params = super().to_params()
params['retmax'] = str(self.max_results)
if self.resume_from is not None:
params['retstart'] = str(self.resume_from)
params['term'] = self.term
return params

Expand Down
Loading