diff --git a/optimade/server/config.py b/optimade/server/config.py index 29ba11a1f..7b56469c6 100644 --- a/optimade/server/config.py +++ b/optimade/server/config.py @@ -180,6 +180,16 @@ class ServerConfig(BaseSettings): ), ] = None + create_default_index: Annotated[ + bool, + Field( + description=( + "Whether to create a set of default indices " + "for supporting databases after inserting JSONL data." + ) + ), + ] = False + use_real_mongo: Annotated[ bool | None, Field(description="DEPRECATED: force usage of MongoDB over any other backend."), diff --git a/optimade/server/entry_collections/elasticsearch.py b/optimade/server/entry_collections/elasticsearch.py index 329b8d502..5bc52016a 100644 --- a/optimade/server/entry_collections/elasticsearch.py +++ b/optimade/server/entry_collections/elasticsearch.py @@ -48,14 +48,19 @@ def __init__( self.client = client if client else CLIENT self.name = name - # If we are creating a new collection from scratch, also create the index, - # otherwise assume it has already been created externally - if CONFIG.insert_test_data: - self.create_optimade_index() - def count(self, *args, **kwargs) -> int: raise NotImplementedError + def create_default_index(self) -> None: + """Create the default index for the collection. + + For Elastic, the default is to create a search index + over all relevant OPTIMADE fields based on the configured + mapper. + + """ + return self.create_optimade_index() + def create_optimade_index(self) -> None: """Load or create an index that can handle aliased OPTIMADE fields and attach it to the current client. diff --git a/optimade/server/entry_collections/entry_collections.py b/optimade/server/entry_collections/entry_collections.py index 8ab382ecc..caa4593fe 100644 --- a/optimade/server/entry_collections/entry_collections.py +++ b/optimade/server/entry_collections/entry_collections.py @@ -279,16 +279,30 @@ def all_fields(self) -> set[str]: return self._all_fields - def create_index(self, fields: str | set[str], unique: bool = False) -> None: - """Create an index on the given fields. + def create_index(self, field: str, unique: bool = False) -> None: + """Create an index on the given field, as stored in the database. Arguments: - fields: The fields, or single field, to index. + field: The database field to index (i.e., if different from the OPTIMADE field, + the mapper should be used to convert between the two). unique: Whether or not the index should be unique. """ raise NotImplementedError + def create_default_index(self) -> None: + """Create the default index for the collection. + + For example, a database backend could override this method to + create a unique index on the `id` field, so that it can be called + on server startup. + + This method should use a mapper to convert any OPTIMADE field names + to the corresponding stored names in the database. + + """ + raise NotImplementedError + def get_attribute_fields(self) -> set[str]: """Get the set of attribute fields diff --git a/optimade/server/entry_collections/mongo.py b/optimade/server/entry_collections/mongo.py index 4ceb9a9d8..6740f6a52 100644 --- a/optimade/server/entry_collections/mongo.py +++ b/optimade/server/entry_collections/mongo.py @@ -105,18 +105,29 @@ def insert(self, data: list[EntryResource | dict]) -> None: """ self.collection.insert_many(data, ordered=False) - def create_index(self, fields: str | set[str], unique: bool = False) -> None: - """Create an index on the given fields. + def create_index(self, field: str, unique: bool = False) -> None: + """Create an index on the given field, as stored in the database. + + If any error is raised during index creation, this method should faithfully + return it, except for the simple case where an identical index already exists. Arguments: - fields: The fields, or single field, to index. + field: The database field to index (i.e., if different from the OPTIMADE field, + the mapper should be used to convert between the two). unique: Whether or not the index should be unique. """ - if isinstance(fields, str): - fields = {fields} + self.collection.create_index(field, unique=unique, background=True) + + def create_default_index(self) -> None: + """Create the default index for the collection. - self.collection.create_indexes(fields, unique=unique, background=True) + For MongoDB, the default is to create a unique index + on the `id` field. This method should obey any configured + mappers. + + """ + self.create_index(self.resource_mapper.get_backend_field("id"), unique=True) def handle_query_params( self, params: EntryListingQueryParams | SingleEntryQueryParams diff --git a/optimade/server/main.py b/optimade/server/main.py index f563f3655..74f770ea0 100644 --- a/optimade/server/main.py +++ b/optimade/server/main.py @@ -123,7 +123,7 @@ def load_entries(endpoint_name: str, endpoint_collection: EntryCollection): f"Requested JSONL file does not exist: {jsonl_path}. Please specify an absolute group." ) - insert_from_jsonl(jsonl_path) + insert_from_jsonl(jsonl_path, create_default_index=CONFIG.create_default_index) LOGGER.debug("Inserted data from JSONL file: %s", jsonl_path) if CONFIG.insert_test_data: diff --git a/optimade/utils.py b/optimade/utils.py index 2cf53c2c8..306bb327f 100644 --- a/optimade/utils.py +++ b/optimade/utils.py @@ -24,11 +24,12 @@ ) -def insert_from_jsonl(jsonl_path: Path) -> None: +def insert_from_jsonl(jsonl_path: Path, create_default_index: bool = False) -> None: """Insert OPTIMADE JSON lines data into the database. Arguments: jsonl_path: Path to the JSON lines file. + create_default_index: Whether to create a default index on the `id` field. """ from collections import defaultdict @@ -50,6 +51,14 @@ def insert_from_jsonl(jsonl_path: Path) -> None: ) jsonl_path = _jsonl_path + # If the chosen database backend supports it, make the default indices + if create_default_index: + for entry_type in ENTRY_COLLECTIONS: + try: + ENTRY_COLLECTIONS[entry_type].create_default_index() + except NotImplementedError: + pass + bad_rows: int = 0 good_rows: int = 0 with open(jsonl_path) as handle: diff --git a/tests/server/entry_collections/test_indexes.py b/tests/server/entry_collections/test_indexes.py new file mode 100644 index 000000000..006bf2e85 --- /dev/null +++ b/tests/server/entry_collections/test_indexes.py @@ -0,0 +1,40 @@ +import pytest +from bson import ObjectId + +from optimade.server.config import CONFIG + + +@pytest.mark.skipif( + CONFIG.database_backend.value not in ("mongomock", "mongodb"), + reason="Skipping index test when testing the elasticsearch backend.", +) +def test_indexes_are_created_where_appropriate(client): + """Test that with the test config, default indices are made by + supported backends. This is tested by checking that we cannot insert + an entry with the same underlying ID as the test data, and that this + returns the appopriate database-specific error. + + """ + import pymongo.errors + + from optimade.server.query_params import EntryListingQueryParams + from optimade.server.routers import ENTRY_COLLECTIONS + + # get one structure with and try to reinsert it + for _type in ENTRY_COLLECTIONS: + result, _, _, _, _ = ENTRY_COLLECTIONS[_type].find( + EntryListingQueryParams(page_limit=1) + ) + assert result is not None + if isinstance(result, list): + result = result[0] + + # The ID is mapped to the test data ID (e.g., 'task_id'), so the index is actually on that + id_field = ENTRY_COLLECTIONS[_type].resource_mapper.get_backend_field("id") + + # Take the raw database result, extract the OPTIMADE ID and try to insert a canary + # document containing just that ID, plus a fake MongoDB ID to avoid '_id' clashes + canary = {id_field: result["id"], "_id": ObjectId(24 * "0")} + # Match either for "Duplicate" (mongomock) or "duplicate" (mongodb) + with pytest.raises(pymongo.errors.BulkWriteError, match="uplicate"): + ENTRY_COLLECTIONS[_type].insert([canary]) # type: ignore diff --git a/tests/test_config.json b/tests/test_config.json index b011371a7..43787f4ea 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -17,6 +17,7 @@ "mongo_count_timeout": 0, "index_base_url": "http://localhost:5001", "insert_from_jsonl": "optimade/server/data/test_data.jsonl", + "create_default_index": true, "provider_fields": { "structures": [ "band_gap",