Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add config option mongo_count_timeout to skip the global count per request #1757

Merged
merged 5 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ jobs:

services:
mongo:
image: mongo:6
image: mongo:7
ports:
- 27017:27017
postgres:
Expand Down
6 changes: 6 additions & 0 deletions optimade/server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,12 @@ class ServerConfig(BaseSettings):
None, description="Host settings to pass through to the `Elasticsearch` class."
)

mongo_count_timeout: int = Field(
5,
description="""Number of seconds to allow MongoDB to perform a full database count before falling back to `null`.
This operation can require a full COLLSCAN for empty queries which can be prohibitively slow if the database does not fit into the active set, hence a timeout can drastically speed-up response times.""",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I do not fully understand what you are writing here, but I think MongoDB should know how many entries there are in a collection, so for an empty filter the query should not be slow. For a more complex query for which MongoDB cannot drastically reduce the number of entries using one of the already existing indexes this would still be a useful feature though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, it shouldn't be slow, but previously for an empty filter we were just naively calling count_documents which does still do a full scan that can be very slow. Now, I am using estimated_document_count for this case, which uses simple collection metadata to just return the number. However for filters, it can still make use of this timeout.

)

mongo_database: str = Field(
"optimade", description="Mongo database for collection data"
)
Expand Down
14 changes: 9 additions & 5 deletions optimade/server/entry_collections/entry_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
import warnings
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterable, List, Set, Tuple, Type, Union
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union

from lark import Transformer

Expand Down Expand Up @@ -126,7 +126,7 @@ def insert(self, data: List[EntryResource]) -> None:
"""

@abstractmethod
def count(self, **kwargs: Any) -> int:
def count(self, **kwargs: Any) -> Union[int, None]:
"""Returns the number of entries matching the query specified
by the keyword arguments.

Expand All @@ -137,7 +137,7 @@ def count(self, **kwargs: Any) -> int:

def find(
self, params: Union[EntryListingQueryParams, SingleEntryQueryParams]
) -> Tuple[Union[None, Dict, List[Dict]], int, bool, Set[str], Set[str],]:
) -> Tuple[Union[None, Dict, List[Dict]], Optional[int], bool, Set[str], Set[str],]:
"""
Fetches results and indicates if more data is available.

Expand Down Expand Up @@ -203,7 +203,11 @@ def find(
if single_entry:
results = results[0] # type: ignore[assignment]

if CONFIG.validate_api_response and data_returned > 1:
if (
CONFIG.validate_api_response
and data_returned is not None
and data_returned > 1
):
raise NotFound(
detail=f"Instead of a single entry, {data_returned} entries were found",
)
Expand All @@ -221,7 +225,7 @@ def find(
@abstractmethod
def _run_db_query(
self, criteria: Dict[str, Any], single_entry: bool = False
) -> Tuple[List[Dict[str, Any]], int, bool]:
) -> Tuple[List[Dict[str, Any]], Optional[int], bool]:
"""Run the query on the backend and collect the results.

Arguments:
Expand Down
28 changes: 20 additions & 8 deletions optimade/server/entry_collections/mongo.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Tuple, Type, Union
from typing import Any, Dict, List, Optional, Tuple, Type, Union

from optimade.filtertransformers.mongo import MongoTransformer
from optimade.models import EntryResource
Expand All @@ -10,6 +10,7 @@

if CONFIG.database_backend.value == "mongodb":
from pymongo import MongoClient, version_tuple
from pymongo.errors import ExecutionTimeout

if version_tuple[0] < 4:
LOGGER.warning(
Expand Down Expand Up @@ -67,9 +68,9 @@
"""Returns the total number of entries in the collection."""
return self.collection.estimated_document_count()

def count(self, **kwargs: Any) -> int:
def count(self, **kwargs: Any) -> Union[int, None]:
"""Returns the number of entries matching the query specified
by the keyword arguments.
by the keyword arguments, or `None` if the count timed out.

Parameters:
**kwargs: Query parameters as keyword arguments. The keys
Expand All @@ -80,9 +81,15 @@
for k in list(kwargs.keys()):
if k not in ("filter", "skip", "limit", "hint", "maxTimeMS"):
del kwargs[k]
if "filter" not in kwargs: # "filter" is needed for count_documents()
kwargs["filter"] = {}
return self.collection.count_documents(**kwargs)
if "filter" not in kwargs:
return self.collection.estimated_document_count()

Check warning on line 85 in optimade/server/entry_collections/mongo.py

View check run for this annotation

Codecov / codecov/patch

optimade/server/entry_collections/mongo.py#L85

Added line #L85 was not covered by tests
else:
if "maxTimeMS" not in kwargs:
kwargs["maxTimeMS"] = 1000 * CONFIG.mongo_count_timeout
try:
return self.collection.count_documents(**kwargs)
except ExecutionTimeout:
return None

Check warning on line 92 in optimade/server/entry_collections/mongo.py

View check run for this annotation

Codecov / codecov/patch

optimade/server/entry_collections/mongo.py#L91-L92

Added lines #L91 - L92 were not covered by tests

def insert(self, data: List[EntryResource]) -> None:
"""Add the given entries to the underlying database.
Expand Down Expand Up @@ -136,7 +143,7 @@

def _run_db_query(
self, criteria: Dict[str, Any], single_entry: bool = False
) -> Tuple[List[Dict[str, Any]], int, bool]:
) -> Tuple[List[Dict[str, Any]], Optional[int], bool]:
"""Run the query on the backend and collect the results.

Arguments:
Expand All @@ -163,7 +170,12 @@
criteria_nolimit.pop("limit", None)
skip = criteria_nolimit.pop("skip", 0)
data_returned = self.count(**criteria_nolimit)
more_data_available = nresults_now + skip < data_returned
# Only correct most of the time: if the total number of remaining results is exactly the page limit
# then this will incorrectly say there is more_data_available
if data_returned is None:
more_data_available = nresults_now == criteria.get("limit", 0)

Check warning on line 176 in optimade/server/entry_collections/mongo.py

View check run for this annotation

Codecov / codecov/patch

optimade/server/entry_collections/mongo.py#L176

Added line #L176 was not covered by tests
else:
more_data_available = nresults_now + skip < data_returned
else:
# SingleEntryQueryParams, e.g., /structures/{entry_id}
data_returned = nresults_now
Expand Down
2 changes: 1 addition & 1 deletion optimade/server/routers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class JSONAPIResponse(JSONResponse):

def meta_values(
url: Union[urllib.parse.ParseResult, urllib.parse.SplitResult, StarletteURL, str],
data_returned: int,
data_returned: Optional[int],
data_available: int,
more_data_available: bool,
schema: Optional[str] = None,
Expand Down
Loading