From 06d41a66429523c38f5199feb9fa87b71b37bb91 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Mon, 18 Sep 2023 22:14:43 +0100 Subject: [PATCH] Replace `elide_data_returned` with count timeout --- optimade/server/config.py | 7 +++--- .../entry_collections/entry_collections.py | 2 +- optimade/server/entry_collections/mongo.py | 24 +++++++++++-------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/optimade/server/config.py b/optimade/server/config.py index cc9e2db8fa..3a3e7cea71 100644 --- a/optimade/server/config.py +++ b/optimade/server/config.py @@ -157,9 +157,10 @@ class ServerConfig(BaseSettings): None, description="Host settings to pass through to the `Elasticsearch` class." ) - elide_data_returned: bool = Field( - False, - description="Whether to skip counting all the results for every query (to set the `data_returned` field), as this may be too strenuous for large databases. Currently only supports MongoDB.", + mongo_count_timeout: int = Field( + 5, + description="""Number of seconds to allow MongoDB to perform a full database count before falling back to `null`. +This operation can require a full COLLSCAN for empty queries which can be prohibitively slow if the database does not fit into the active set, hence a timeout can drastically speed-up response times.""", ) mongo_database: str = Field( diff --git a/optimade/server/entry_collections/entry_collections.py b/optimade/server/entry_collections/entry_collections.py index 0722f74097..d59bb8713d 100644 --- a/optimade/server/entry_collections/entry_collections.py +++ b/optimade/server/entry_collections/entry_collections.py @@ -126,7 +126,7 @@ def insert(self, data: List[EntryResource]) -> None: """ @abstractmethod - def count(self, **kwargs: Any) -> int: + def count(self, **kwargs: Any) -> Union[int, None]: """Returns the number of entries matching the query specified by the keyword arguments. diff --git a/optimade/server/entry_collections/mongo.py b/optimade/server/entry_collections/mongo.py index 1c70f01992..b7bd20f9b9 100644 --- a/optimade/server/entry_collections/mongo.py +++ b/optimade/server/entry_collections/mongo.py @@ -10,6 +10,7 @@ if CONFIG.database_backend.value == "mongodb": from pymongo import MongoClient, version_tuple + from pymongo.errors import ExecutionTimeout if version_tuple[0] < 4: LOGGER.warning( @@ -67,9 +68,9 @@ def __len__(self) -> int: """Returns the total number of entries in the collection.""" return self.collection.estimated_document_count() - def count(self, **kwargs: Any) -> int: + def count(self, **kwargs: Any) -> Union[int, None]: """Returns the number of entries matching the query specified - by the keyword arguments. + by the keyword arguments, or `None` if the count timed out. Parameters: **kwargs: Query parameters as keyword arguments. The keys @@ -80,11 +81,15 @@ def count(self, **kwargs: Any) -> int: for k in list(kwargs.keys()): if k not in ("filter", "skip", "limit", "hint", "maxTimeMS"): del kwargs[k] - if "filter" not in kwargs: # "filter" is needed for count_documents() - kwargs["filter"] = {} + if "filter" not in kwargs: return self.collection.estimated_document_count() else: - return self.collection.count_documents(**kwargs) + if "maxTimeMS" not in kwargs: + kwargs["maxTimeMS"] = 1000 * CONFIG.mongo_count_timeout + try: + return self.collection.count_documents(**kwargs) + except ExecutionTimeout: + return None def insert(self, data: List[EntryResource]) -> None: """Add the given entries to the underlying database. @@ -164,13 +169,12 @@ def _run_db_query( criteria_nolimit = criteria.copy() criteria_nolimit.pop("limit", None) skip = criteria_nolimit.pop("skip", 0) - if CONFIG.elide_data_returned: - data_returned = None - # Only correct most of the time: if the total number of remaining results is exactly the page limit - # then this will incorrectly say there is more_data_available + data_returned = self.count(**criteria_nolimit) + # Only correct most of the time: if the total number of remaining results is exactly the page limit + # then this will incorrectly say there is more_data_available + if data_returned is None: more_data_available = nresults_now == criteria.get("limit", 0) else: - data_returned = self.count(**criteria_nolimit) more_data_available = nresults_now + skip < data_returned else: # SingleEntryQueryParams, e.g., /structures/{entry_id}