Skip to content

Commit

Permalink
Pinecone vector store (#1135)
Browse files Browse the repository at this point in the history
Added support for Pinecone in EvaDB. Since Pinecone requires an API key,
created an environment variable in the evadb.yml file. If not set, we check
the OS environment.
  • Loading branch information
Chitti-Ankith authored Sep 21, 2023
1 parent 443510a commit 952d110
Show file tree
Hide file tree
Showing 13 changed files with 193 additions and 7 deletions.
4 changes: 2 additions & 2 deletions docs/source/dev-guide/contribute/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ If you want to run a specific test file, use the following command.

.. code-block:: bash
PYTHONPATH="." python -m pytest test/integration_tests/test_select_executor.py
PYTHONPATH="." python -m pytest test/integration_tests/long/test_select_executor.py
Use the following command to run a specific test case within a specific test file.

.. code-block:: bash
PYTHONPATH="." python -m pytest test/integration_tests/test_select_executor.py -k 'test_should_load_and_select_in_table'
PYTHONPATH="." python -m pytest test/integration_tests/long/test_select_executor.py -k 'test_should_load_and_select_in_table'
1 change: 1 addition & 0 deletions evadb/catalog/catalog_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def to_numpy_type(cls, t):
# Vector index backends that can follow USING in a CREATE INDEX statement.
# NOTE(review): members are written as bare names (hence the F821 suppressions);
# presumably EvaDBEnum turns each bare name into an enum member - confirm against
# its definition.
class VectorStoreType(EvaDBEnum):
FAISS # noqa: F821
QDRANT # noqa: F821
PINECONE # noqa: F821


class VideoColumnName(EvaDBEnum):
Expand Down
2 changes: 2 additions & 0 deletions evadb/evadb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ experimental:

third_party:
OPENAI_KEY: ""
PINECONE_API_KEY: ""
PINECONE_ENV: ""
2 changes: 2 additions & 0 deletions evadb/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ def handle_vector_store_params(
return {"index_path": index_path}
elif vector_store_type == VectorStoreType.QDRANT:
return {"index_db": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.PINECONE:
return {}
else:
raise ValueError("Unsupported vector store type: {}".format(vector_store_type))

Expand Down
2 changes: 1 addition & 1 deletion evadb/interfaces/relational/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def create_vector_index(
index_name (str): Name of the index.
table_name (str): Name of the table.
expr (str): Expression used to build the vector index.
using (str): Method used for indexing, can be `FAISS` or `QDRANT`.
using (str): Method used for indexing, can be `FAISS` or `QDRANT` or `PINECONE`.
Returns:
EvaDBCursor: The EvaDBCursor object.
Expand Down
3 changes: 2 additions & 1 deletion evadb/parser/evadb.lark
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ function_metadata_key: uid

function_metadata_value: string_literal | decimal_literal

vector_store_type: USING (FAISS | QDRANT)
vector_store_type: USING (FAISS | QDRANT | PINECONE)

index_elem: ("(" uid_list ")"
| "(" function_call ")")
Expand Down Expand Up @@ -415,6 +415,7 @@ PDF: "PDF"i
HNSW: "HNSW"i
FAISS: "FAISS"i
QDRANT: "QDRANT"i
PINECONE: "PINECONE"i

// Computer vision tasks

Expand Down
2 changes: 2 additions & 0 deletions evadb/parser/lark_visitor/_create_statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ def vector_store_type(self, tree):
vector_store_type = VectorStoreType.FAISS
elif str.upper(token) == "QDRANT":
vector_store_type = VectorStoreType.QDRANT
elif str.upper(token) == "PINECONE":
vector_store_type = VectorStoreType.PINECONE
return vector_store_type

# INDEX CREATION
Expand Down
111 changes: 111 additions & 0 deletions evadb/third_party/vector_stores/pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List

from evadb.configuration.configuration_manager import ConfigurationManager
from evadb.third_party.vector_stores.types import (
FeaturePayload,
VectorIndexQuery,
VectorIndexQueryResult,
VectorStore,
)
from evadb.utils.generic_utils import try_to_import_pinecone_client
from evadb.utils.logging_manager import logger

# Keyword arguments a caller must supply when constructing this store; the
# empty list means none are needed.
required_params = []
# Module-wide flag: flipped to True after pinecone.init() has been called so
# that later PineconeVectorStore instances skip re-initialization.
_pinecone_init_done = False


class PineconeVectorStore(VectorStore):
    """Vector store backed by a Pinecone index.

    Credentials are looked up first in the EvaDB configuration
    (``third_party`` section) and then in the process environment, using the
    keys ``PINECONE_API_KEY`` and ``PINECONE_ENV``.
    """

    def __init__(self, index_name: str) -> None:
        """Resolve credentials and initialize the pinecone client once.

        Args:
            index_name: name of the index; it is stripped and lower-cased
                before use.

        Raises:
            ValueError: if the pinecone package cannot be imported.
            AssertionError: if the API key or the environment name is not
                configured anywhere.
        """
        try_to_import_pinecone_client()
        global _pinecone_init_done
        # pinecone only allows index names with lower alpha-numeric characters and '-'
        self._index_name = index_name.strip().lower()

        # Get the API key.
        self._api_key = self._read_setting(
            "PINECONE_API_KEY",
            "Please set your Pinecone API key in evadb.yml file (third_party, pinecone_api_key) or environment variable (PINECONE_API_KEY)",
        )

        # Get the environment name.
        self._environment = self._read_setting(
            "PINECONE_ENV",
            "Please set the Pinecone environment key in evadb.yml file (third_party, pinecone_env) or environment variable (PINECONE_ENV)",
        )

        if not _pinecone_init_done:
            # Initialize pinecone.
            import pinecone

            pinecone.init(api_key=self._api_key, environment=self._environment)
            _pinecone_init_done = True
        # The index handle is created lazily (see _get_client) or by create().
        self._client = None

    @staticmethod
    def _read_setting(key: str, missing_message: str) -> str:
        """Return a third-party setting from the EvaDB config or the environment.

        Raises:
            AssertionError: with ``missing_message`` when neither source
                provides a non-empty value.
        """
        value = ConfigurationManager().get_value("third_party", key)
        if not value:
            value = os.environ.get(key)
        if not value:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            raise AssertionError(missing_message)
        return value

    def _get_client(self):
        """Return the handle for this index, creating it on first use."""
        if self._client is None:
            import pinecone

            self._client = pinecone.Index(self._index_name)
        return self._client

    def create(self, vector_dim: int):
        """Create the Pinecone index with cosine metric and bind a handle to it.

        Args:
            vector_dim: dimensionality of the vectors the index will hold.
        """
        import pinecone

        pinecone.create_index(self._index_name, dimension=vector_dim, metric="cosine")
        logger.warning(
            f"""Created index {self._index_name}. Please note that Pinecone is eventually consistent, hence any additions to the Vector Index may not get immediately reflected in queries."""
        )
        self._client = pinecone.Index(self._index_name)

    def add(self, payload: List[FeaturePayload]):
        """Upsert the given rows, keyed by the string form of each row id.

        Args:
            payload: rows to store; each embedding is flattened to a plain
                list of values before being sent.
        """
        if not payload:
            # Nothing to send; skip the request entirely.
            return
        # Resolve the handle lazily so adding also works on an instance on
        # which create() was never called.
        self._get_client().upsert(
            vectors=[
                {"id": str(row.id), "values": row.embedding.reshape(-1).tolist()}
                for row in payload
            ]
        )

    def delete(self) -> None:
        """Delete the index and forget the handle that pointed at it."""
        import pinecone

        pinecone.delete_index(self._index_name)
        self._client = None

    def query(
        self,
        query: VectorIndexQuery,
    ) -> VectorIndexQueryResult:
        """Return the scores and integer ids of the ``top_k`` matches.

        Args:
            query: carries the query embedding and the number of matches
                to fetch.
        """
        response = self._get_client().query(
            top_k=query.top_k, vector=query.embedding.reshape(-1).tolist()
        )

        distances, ids = [], []

        for row in response["matches"]:
            distances.append(row["score"])
            # Ids are stored as strings by add(); convert them back.
            ids.append(int(row["id"]))

        return VectorIndexQueryResult(distances, ids)
7 changes: 7 additions & 0 deletions evadb/third_party/vector_stores/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
from evadb.catalog.catalog_type import VectorStoreType
from evadb.third_party.vector_stores.faiss import FaissVectorStore
from evadb.third_party.vector_stores.pinecone import PineconeVectorStore
from evadb.third_party.vector_stores.qdrant import QdrantVectorStore
from evadb.utils.generic_utils import validate_kwargs

Expand All @@ -35,5 +36,11 @@ def init_vector_store(
validate_kwargs(kwargs, required_params, required_params)
return QdrantVectorStore(index_name, **kwargs)

elif vector_store_type == VectorStoreType.PINECONE:
from evadb.third_party.vector_stores.pinecone import required_params

validate_kwargs(kwargs, required_params, required_params)
return PineconeVectorStore(index_name, **kwargs)

else:
raise Exception(f"Vector store {vector_store_type} not supported")
10 changes: 10 additions & 0 deletions evadb/utils/generic_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,16 @@ def try_to_import_qdrant_client():
)


def try_to_import_pinecone_client():
    """Check that the ``pinecone`` package can be imported.

    Raises:
        ValueError: if the import fails; the message carries the pip
            command needed to install the package.
    """
    try:
        import pinecone  # noqa: F401
    except ImportError:
        # The install hint previously closed its quote with a backtick.
        raise ValueError(
            """Could not import pinecone_client python package.
                Please install it with 'pip install pinecone_client'."""
        )


def is_qdrant_available() -> bool:
try:
try_to_import_qdrant_client()
Expand Down
2 changes: 2 additions & 0 deletions script/formatting/spelling.txt
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,8 @@ ParserStatementTests
ParserTests
PermissionDenied
PickleSerializer
PINECONE
Pinecone
PlanExecutor
PlanExecutorTest
PlanGenerator
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def read(path, encoding="utf-8"):

qdrant_libs = ["qdrant_client"] # cannot install on 3.11 due to grcpio

pinecone_libs = ["pinecone-client"]

postgres_libs = [
"psycopg2",
]
Expand Down
52 changes: 49 additions & 3 deletions test/integration_tests/long/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import unittest
from test.markers import gpu_skip_marker, qdrant_skip_marker
from test.markers import qdrant_skip_marker
from test.util import (
create_sample_image,
get_evadb_for_testing,
Expand Down Expand Up @@ -116,6 +117,13 @@ def setUp(self):

base_img -= 1

# Set the env variables.
self.original_pinecone_key = os.environ.get("PINECONE_API_KEY")
self.original_pinecone_env = os.environ.get("PINECONE_ENV")

os.environ["PINECONE_API_KEY"] = "657e4fae-7208-4555-b0f2-9847dfa5b818"
os.environ["PINECONE_ENV"] = "gcp-starter"

def tearDown(self):
shutdown_ray()

Expand All @@ -125,6 +133,15 @@ def tearDown(self):
execute_query_fetch_all(self.evadb, drop_table_query)
drop_table_query = "DROP TABLE IF EXISTS testSimilarityImageDataset;"
execute_query_fetch_all(self.evadb, drop_table_query)
# Reset the env variables.
if self.original_pinecone_key:
os.environ["PINECONE_API_KEY"] = self.original_pinecone_key
else:
del os.environ["PINECONE_API_KEY"]
if self.original_pinecone_env:
os.environ["PINECONE_ENV"] = self.original_pinecone_env
else:
del os.environ["PINECONE_ENV"]

def test_similarity_should_work_in_order(self):
###############################################
Expand Down Expand Up @@ -368,10 +385,12 @@ def test_end_to_end_index_scan_should_work_correctly_on_image_dataset(self):
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)

@gpu_skip_marker
# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexImageDataset")

@qdrant_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_qdrant(self):
create_index_query = """CREATE INDEX testFaissIndexImageDataset
create_index_query = """CREATE INDEX testQdrantIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING QDRANT;"""
execute_query_fetch_all(self.evadb, create_index_query)
Expand All @@ -388,3 +407,30 @@ def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_qdrant(sel

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testQdrantIndexImageDataset")

def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_pinecone(
    self,
):
    """Build a Pinecone index over the image table and check the top match.

    The index is dropped before the run and again afterwards, even when the
    assertion fails, so a failed run does not leave the index behind.
    """
    # We need to always drop the index as Pinecone's free tier only supports a single current index.
    drop_index_query = "DROP INDEX IF EXISTS testpineconeindeximagedataset;"
    execute_query_fetch_all(self.evadb, drop_index_query)
    create_index_query = """CREATE INDEX testpineconeindeximagedataset
            ON testSimilarityImageDataset (DummyFeatureExtractor(data))
            USING PINECONE;"""
    execute_query_fetch_all(self.evadb, create_index_query)

    try:
        # Sleep to ensure the pinecone records get updated as Pinecone is eventually consistent.
        time.sleep(20)

        select_query = """SELECT _row_id FROM testSimilarityImageDataset
                ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
                LIMIT 1;""".format(
            self.img_path
        )

        res_batch = execute_query_fetch_all(self.evadb, select_query)
        self.assertEqual(
            res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
        )
    finally:
        # Runs on failure too, so the index created above is always removed.
        drop_index_query = "DROP INDEX testpineconeindeximagedataset;"
        execute_query_fetch_all(self.evadb, drop_index_query)

0 comments on commit 952d110

Please sign in to comment.