Run integration tests in CI #4

Closed · wants to merge 4 commits
57 changes: 57 additions & 0 deletions .github/workflows/_integration_test.yml
@@ -0,0 +1,57 @@
name: integration-test

on:
  workflow_call:
    inputs:
      working-directory:
        required: true
        type: string
        description: "From which folder this pipeline executes"

env:
  POETRY_VERSION: "1.7.1"
  DOCKER_COMPOSE_YAML: "libs/elasticsearch/integration_tests/docker-compose.yml"

jobs:
  build:
    defaults:
      run:
        working-directory: ${{ inputs.working-directory }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
    name: "Integration tests"
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
        uses: "./.github/actions/poetry_setup"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          working-directory: ${{ inputs.working-directory }}
          cache-key: integration-test

      - name: Install dependencies
        shell: bash
        run: poetry install --with=test_integration,test

      - name: Start containers
        shell: bash
        run: docker-compose -f "$DOCKER_COMPOSE_YAML" up elasticsearch -d --build

      - name: Run integration tests
        shell: bash
        env:
          OPENAI_API_KEY: ${{ secrets.SuperSecret }}
        run: make integration_test

      - name: Stop containers
        if: always()
        shell: bash
        run: docker-compose -f "$DOCKER_COMPOSE_YAML" down elasticsearch
15 changes: 14 additions & 1 deletion .github/workflows/check_diffs.yml
@@ -69,7 +69,7 @@ jobs:
    uses: ./.github/workflows/_compile_integration_test.yml
    with:
      working-directory: ${{ matrix.working-directory }}
    secrets: inherit

  ci_success:
    name: "CI Success"
    needs: [build, lint, test, compile-integration-tests]
@@ -87,3 +87,16 @@ jobs:
          echo $RESULTS_JSON
          echo "Exiting with $EXIT_CODE"
          exit $EXIT_CODE
    secrets: inherit

  # integration-test:
  #   name: cd ${{ matrix.working-directory }}
  #   needs: [ compile-interation-tests ]
  #   if: ${{ needs.build.outputs.dirs-to-test != '[]' }}
  #   strategy:
  #     matrix:
  #       working-directory: ${{ fromJson(needs.build.outputs.dirs-to-test) }}
  #   uses: ./.github/workflows/_integration_test.yml
  #   with:
  #     working-directory: ${{ matrix.working-directory }}
  #   secrets: inherit
39 changes: 37 additions & 2 deletions libs/elasticsearch/tests/integration_tests/_test_utilities.py
@@ -1,8 +1,11 @@
import os
from typing import Any, Dict, List
import time
from typing import Any, Dict, List, Optional

from elastic_transport import Transport
from elasticsearch import Elasticsearch
from elasticsearch import Elasticsearch, NotFoundError

from langchain_elasticsearch._utilities import check_if_model_deployed


def clear_test_indices(es: Elasticsearch) -> None:
@@ -40,3 +43,35 @@ def perform_request(self, *args, **kwargs): # type: ignore
    es = Elasticsearch(hosts=[es_url], transport_class=CustomTransport)

    return es


def deploy_model(
    es_client: Elasticsearch,
    model_id: str = ".elser_model_2",
    field: str = "text_field",
    model_type: Optional[str] = None,
    inference_config: Optional[Dict] = None,
) -> None:
    try:
        check_if_model_deployed(es_client, model_id)
    except NotFoundError:
        # download the model
        es_client.ml.put_trained_model(
            model_id=model_id,
            input={"field_names": [field]},
            model_type=model_type,
            inference_config=inference_config,
        )

        # wait until ready
        while True:
            status = es_client.ml.get_trained_models(
                model_id=model_id, include="definition_status"
            )
            if status["trained_model_configs"][0]["fully_defined"]:
                # model is ready
                break
            time.sleep(1)

        # deploy the model
        es_client.ml.start_trained_model_deployment(model_id=model_id)
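
For reference, a minimal sketch (not part of the PR) of how a local run might use this helper before invoking the tests. It assumes the ES_URL default and the .elser_model_2 model id that appear elsewhere in this diff, and that it is executed from within the integration_tests package:

import os

from elasticsearch import Elasticsearch

from ._test_utilities import deploy_model

# Connect to the Elasticsearch container started via docker-compose and make
# sure the ELSER model is downloaded and deployed before the tests run.
es_client = Elasticsearch(hosts=[os.environ.get("ES_URL", "http://localhost:9200")])
deploy_model(es_client, model_id=".elser_model_2")
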
76 changes: 33 additions & 43 deletions libs/elasticsearch/tests/integration_tests/test_embeddings.py
@@ -1,48 +1,38 @@
"""Test elasticsearch_embeddings embeddings."""

import pytest
from langchain_core.utils import get_from_env
import os

from elasticsearch import Elasticsearch

from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings

# deployed with
# https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-text-emb-vector-search-example.html
DEFAULT_MODEL = "sentence-transformers__msmarco-minilm-l-12-v3"
DEFAULT_NUM_DIMENSIONS = "384"


@pytest.fixture
def model_id() -> str:
    return get_from_env("model_id", "MODEL_ID", DEFAULT_MODEL)


@pytest.fixture
def expected_num_dimensions() -> int:
    return int(
        get_from_env(
            "expected_num_dimensions", "EXPECTED_NUM_DIMENSIONS", DEFAULT_NUM_DIMENSIONS
        )
    )


def test_elasticsearch_embedding_documents(
    model_id: str, expected_num_dimensions: int
) -> None:
    """Test Elasticsearch embedding documents."""
    documents = ["foo bar", "bar foo", "foo"]
    embedding = ElasticsearchEmbeddings.from_credentials(model_id)
    output = embedding.embed_documents(documents)
    assert len(output) == 3
    assert len(output[0]) == expected_num_dimensions
    assert len(output[1]) == expected_num_dimensions
    assert len(output[2]) == expected_num_dimensions


def test_elasticsearch_embedding_query(
    model_id: str, expected_num_dimensions: int
) -> None:
    """Test Elasticsearch embedding query."""
    document = "foo bar"
    embedding = ElasticsearchEmbeddings.from_credentials(model_id)
    output = embedding.embed_query(document)
    assert len(output) == expected_num_dimensions
from ._test_utilities import deploy_model

ES_CLIENT = Elasticsearch(hosts=[os.environ.get("ES_URL", "http://localhost:9200")])
MODEL_ID = ".elser_model_2"


class TestEmbeddings:
    @classmethod
    def setup_class(cls) -> None:
        deploy_model(ES_CLIENT, MODEL_ID)

    def test_elasticsearch_embedding_documents(self) -> None:
        """Test Elasticsearch embedding documents."""
        documents = ["foo bar", "bar foo", "foo"]
        embedding = ElasticsearchEmbeddings(ES_CLIENT.ml, MODEL_ID)
        output = embedding.embed_documents(documents)
        assert len(output) == 3
        assert "foo" in output[0]
        assert "##bar" in output[0]
        assert "bar" in output[1]
        assert "foo" in output[1]
        assert "foo" in output[2]

    def test_elasticsearch_embedding_query(self) -> None:
        """Test Elasticsearch embedding query."""
        document = "foo bar"
        embedding = ElasticsearchEmbeddings(ES_CLIENT.ml, MODEL_ID)
        output = embedding.embed_query(document)
        assert "foo" in output
        assert "##bar" in output
21 changes: 9 additions & 12 deletions libs/elasticsearch/tests/integration_tests/test_vectorstores.py
@@ -17,7 +17,7 @@
    ConsistentFakeEmbeddings,
    FakeEmbeddings,
)
from ._test_utilities import clear_test_indices, requests_saving_es_client
from ._test_utilities import clear_test_indices, deploy_model, requests_saving_es_client

logging.basicConfig(level=logging.DEBUG)

@@ -40,17 +40,11 @@
"""

modelsDeployed: List[str] = [
    # ".elser_model_1",
    # "sentence-transformers__all-minilm-l6-v2",
]


class TestElasticsearch:
    @classmethod
    def setup_class(cls) -> None:
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")

    @pytest.fixture(scope="class", autouse=True)
    def elasticsearch_connection(self) -> Union[dict, Generator[dict, None, None]]:
        es_url = os.environ.get("ES_URL", "http://localhost:9200")
@@ -708,20 +702,23 @@ def assert_query(query_body: dict, query: str) -> dict:
        output = docsearch.similarity_search("bar", k=1)
        assert output == [Document(page_content="bar")]

    @pytest.mark.skipif(
        ".elser_model_1" not in modelsDeployed,
        reason="ELSER not deployed in ML Node, skipping test",
    )
    def test_similarity_search_with_sparse_infer_instack(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """test end to end with sparse retrieval strategy and inference in-stack"""
        model_id = ".elser_model_2"

        es_client = ElasticsearchStore.connect_to_elasticsearch(
            **elasticsearch_connection
        )
        deploy_model(es_client, model_id)

        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
            strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id),
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]