Skip to content

Commit

Permalink
fix: 1199 enhance similarity search test to make sure index is used (#1212)
Browse files Browse the repository at this point in the history
* Drop index has some bugs on newly added indexes
* Improve msg
* Enhance test cases
  • Loading branch information
jiashenC authored Sep 25, 2023
1 parent 6d74501 commit 09d6434
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 68 deletions.
6 changes: 5 additions & 1 deletion evadb/catalog/services/index_catalog_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,11 @@ def delete_entry_by_name(self, name: str):
index_metadata = index_obj.as_dataclass()
# clean up the on disk data
if os.path.exists(index_metadata.save_file_path):
os.remove(index_metadata.save_file_path)
if os.path.isfile(index_metadata.save_file_path):
# For service-hosting-based vector database, we should not
# touch their base directory. The only case that needs to
# be taken care of is FAISS index local disk file.
os.remove(index_metadata.save_file_path)
index_obj.delete(self.session)
except Exception:
logger.exception("Delete index failed for name {}".format(name))
Expand Down
2 changes: 1 addition & 1 deletion evadb/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def handle_vector_store_params(
elif vector_store_type == VectorStoreType.QDRANT:
return {"index_db": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.CHROMADB:
return {"index_path": index_path}
return {"index_path": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.PINECONE:
return {}
else:
Expand Down
4 changes: 2 additions & 2 deletions evadb/third_party/vector_stores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, index_name: str) -> None:

assert (
self._api_key
), "Please set your Pinecone API key in evadb.yml file (third_party, pinecone_api_key) or environment variable (PINECONE_KEY)"
), "Please set your Pinecone API key in evadb.yml file (third_party, pinecone_api_key) or environment variable (PINECONE_KEY). It can be found at Pinecone Dashboard > API Keys > Value"

# Get the environment name.
self._environment = ConfigurationManager().get_value(
Expand All @@ -57,7 +57,7 @@ def __init__(self, index_name: str) -> None:

assert (
self._environment
), "Please set the Pinecone environment key in evadb.yml file (third_party, pinecone_env) or environment variable (PINECONE_ENV)"
), "Please set the Pinecone environment key in evadb.yml file (third_party, pinecone_env) or environment variable (PINECONE_ENV). It can be found Pinecone Dashboard > API Keys > Environment."

if not _pinecone_init_done:
# Initialize pinecone.
Expand Down
167 changes: 103 additions & 64 deletions test/integration_tests/long/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,10 @@ def test_should_do_vector_index_scan(self):
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite1")
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite2")
drop_query = "DROP INDEX testFaissIndexScanRewrite1"
execute_query_fetch_all(self.evadb, drop_query)
drop_query = "DROP INDEX testFaissIndexScanRewrite2"
execute_query_fetch_all(self.evadb, drop_query)

def test_should_not_do_vector_index_scan_with_desc_order(self):
# Execution with index scan.
Expand Down Expand Up @@ -347,7 +349,8 @@ def test_should_not_do_vector_index_scan_with_desc_order(self):
self.assertTrue(np.array_equal(actual_open, base_img + 3))

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite")
drop_query = "DROP INDEX testFaissIndexScanRewrite"
execute_query_fetch_all(self.evadb, drop_query)

def test_should_not_do_vector_index_scan_with_predicate(self):
# Execution with index scan.
Expand All @@ -370,86 +373,122 @@ def test_should_not_do_vector_index_scan_with_predicate(self):
self.assertFalse("FaissIndexScan" in batch.frames[0][0])

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite")
drop_query = "DROP INDEX testFaissIndexScanRewrite"
execute_query_fetch_all(self.evadb, drop_query)

def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_faiss(self):
for _ in range(2):
create_index_query = """CREATE INDEX testFaissIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING FAISS;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

def test_end_to_end_index_scan_should_work_correctly_on_image_dataset(self):
create_index_query = """CREATE INDEX testFaissIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING FAISS;"""
execute_query_fetch_all(self.evadb, create_index_query)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexImageDataset")
# Cleanup
drop_query = "DROP INDEX testFaissIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@qdrant_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_qdrant(self):
create_index_query = """CREATE INDEX testQdrantIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING QDRANT;"""
execute_query_fetch_all(self.evadb, create_index_query)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
for _ in range(2):
create_index_query = """CREATE INDEX testQdrantIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING QDRANT;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

"""|__ ProjectPlan
|__ VectorIndexScanPlan
|__ SeqScanPlan
|__ StoragePlan"""
"""|__ ProjectPlan
|__ VectorIndexScanPlan
|__ SeqScanPlan
|__ StoragePlan"""

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testQdrantIndexImageDataset")
# Cleanup
drop_query = "DROP INDEX testQdrantIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@chromadb_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_chromadb(
self,
):
create_index_query = """CREATE INDEX testChromaDBIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING CHROMADB;"""
execute_query_fetch_all(self.evadb, create_index_query)
for _ in range(2):
create_index_query = """CREATE INDEX testChromaDBIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING CHROMADB;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
# Cleanup
drop_query = "DROP INDEX testChromaDBIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@pinecone_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_pinecone(
self,
):
# We need to always drop the index as Pinecone's free tier only supports a single current index.
drop_index_query = "DROP INDEX IF EXISTS testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)
create_index_query = """CREATE INDEX testpineconeindeximagedataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING PINECONE;"""
execute_query_fetch_all(self.evadb, create_index_query)
# Sleep to ensure the pinecone records get updated as Pinecone is eventually consistent.
time.sleep(20)
for _ in range(2):
create_index_query = """CREATE INDEX testpineconeindeximagedataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING PINECONE;"""
execute_query_fetch_all(self.evadb, create_index_query)
# Sleep to ensure the pinecone records get updated as Pinecone is eventually consistent.
time.sleep(20)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
drop_index_query = "DROP INDEX testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)
drop_index_query = "DROP INDEX testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)

0 comments on commit 09d6434

Please sign in to comment.