From 1a149079d612121ee9aca4ca306fa179f94b901c Mon Sep 17 00:00:00 2001 From: Max Jakob Date: Wed, 8 May 2024 12:03:28 +0200 Subject: [PATCH] bring back previous integration tests --- libs/elasticsearch/poetry.lock | 118 ++- libs/elasticsearch/pyproject.toml | 2 +- .../integration_tests/test_vectorstores.py | 752 +++++++++++++++++- 3 files changed, 869 insertions(+), 3 deletions(-) diff --git a/libs/elasticsearch/poetry.lock b/libs/elasticsearch/poetry.lock index e9ade8e..eeaa8c2 100644 --- a/libs/elasticsearch/poetry.lock +++ b/libs/elasticsearch/poetry.lock @@ -338,6 +338,8 @@ files = [ [package.dependencies] elastic-transport = ">=8.13,<9" +numpy = {version = ">=1", optional = true, markers = "extra == \"vectorstore-mmr\""} +simsimd = {version = ">=3", optional = true, markers = "extra == \"vectorstore-mmr\""} [package.extras] async = ["aiohttp (>=3,<4)"] @@ -1307,6 +1309,120 @@ files = [ {file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"}, ] +[[package]] +name = "simsimd" +version = "4.3.1" +description = "Fastest SIMD-Accelerated Vector Similarity Functions for x86 and Arm" +optional = false +python-versions = "*" +files = [ + {file = "simsimd-4.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17e92168fd00f8721c182a248688f9df415222cc90a12ff2c834b814cc248f64"}, + {file = "simsimd-4.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d3b251e7b4df35252eb9fd1c885885647102e533277f1e526313fd29c2fcf845"}, + {file = "simsimd-4.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1222bf0500af4eacff2215c90c8309466a380c4f73b51e21cccc6fe4aae71284"}, + {file = "simsimd-4.3.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:806ba354a323f3d52869d26fb27027a1722091135e77e942fd09de91fca107c4"}, + {file = "simsimd-4.3.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ad8f5bcbaa6e3607f6b7770197d89413f789841558b116017eb3670ce2ba9ae"}, + {file = "simsimd-4.3.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d8ecec183a3826c4dadf1838323231d8d72f5c8f16e3e97e25a157e8a0b252a9"}, + {file = "simsimd-4.3.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ce0d88025b9e7f20ee8246123a67885a20228bc935755e4b5154d5a0b292fb3"}, + {file = "simsimd-4.3.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:99ff0aff394a0f4798909325700c74d06629d1e08cea619cfb2b66c2b8f10dfa"}, + {file = "simsimd-4.3.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:101d1a08b4976ec69b5a6ccfc92a793034071997ac84eab2c0a6d9a16c8e8a1c"}, + {file = "simsimd-4.3.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a53656ecc3392b59a62bd5699c6c3b3a151a9b8a0d43d29857ba1412980fc4d3"}, + {file = "simsimd-4.3.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:b4051b0144ddf170cda17eeb86bc4db64fa0163d751fb732108b0fb875932f4d"}, + {file = "simsimd-4.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418eeadb62b67b20d765b44957a03914e6c96ff0cd7cb909d91d23052eb5951e"}, + {file = "simsimd-4.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc3a034cc9b8e430416d3705ff450735f5bea05bbd6736fe5be314bd65ebc4f5"}, + {file = "simsimd-4.3.1-cp310-cp310-win32.whl", hash = "sha256:d86956438b00b2b53b12a3221c40d8e099d72f15d267f1d8e00de6d2234a0be5"}, + {file = "simsimd-4.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:a6568c32c5e339470071d4382a9a125fac2ff871ca925bbf5bd0cd122caac791"}, + {file = "simsimd-4.3.1-cp310-cp310-win_arm64.whl", hash = 
"sha256:5447a92ea7d6050f51c10ed5a87384dcc55134d70f0cf5334059a8c43fefcd87"}, + {file = "simsimd-4.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:561327a2f2d554fb2622cf7b70176687745f3a3c7cdc82184c1776376fe5ed63"}, + {file = "simsimd-4.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7be739582b59625046644297ee93ef53b036fffff8765ffcb14ee250e1c8eaca"}, + {file = "simsimd-4.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:38c7cb08a67f3e2841188c67e6964c57b025608174296db63de78d9fb6046eb8"}, + {file = "simsimd-4.3.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25697848d40da92e4a1c63587643db7c81215b36028580ac5664730f7911bcaa"}, + {file = "simsimd-4.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d437459498158da62b12bcbb41ce5615277fda9b48e9c7e53566109ad407544"}, + {file = "simsimd-4.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1ea3363429a238541c7c56490f584403e408e319e2414ff1c2354302cc6b27da"}, + {file = "simsimd-4.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7b9480f38eedb557d3ece7dff176b16d886ae579ebcba6d6d6bf97789ea35205"}, + {file = "simsimd-4.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:773cbf1c3e5f8f77d3171051255b5d10a3add7dcf7ec00e2efda587350b72e72"}, + {file = "simsimd-4.3.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:35418cbf47af8ee17ea98be438c89629f52030f26eb18801ff7f6de4037203c3"}, + {file = "simsimd-4.3.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:37eb8141f569907e26067dc03464fad6c12d59d4fc12389ebda89aafca4402e8"}, + {file = "simsimd-4.3.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:876f9b334dd60334e99f3ecfa6f02b54008d75c34aca9990cbf35aeca7b57c53"}, + {file = "simsimd-4.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d01454046ffa10ab45876372e8924fe7cdd61e54993f99db2b0c3ba8a8548701"}, + {file = "simsimd-4.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:abc11653df3134dcb6c5526854cc545243ba71d4826dc5bcbef22502929a0146"}, + {file = "simsimd-4.3.1-cp311-cp311-win32.whl", hash = "sha256:46d5c7c5c1a1fb29a02c722f0c1e40068c40aa0e86a0b815f13ba628763fa1a5"}, + {file = "simsimd-4.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:75a8fd16ed3741ed5d3b43148eb9220b5d8ee505c118e3efcd3cfdde5814df96"}, + {file = "simsimd-4.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:d31375e088f849e6380c3f9c460a5c670ab6d50244715200049b1772588da424"}, + {file = "simsimd-4.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3a35be5e13c8205b109a46f9918fd230e93f8ac4daad5909f4c890198322aeb5"}, + {file = "simsimd-4.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fbace3dcc94e92d3b9336f6837675c537ca37e9b89ceabf4baf354fc515b0af3"}, + {file = "simsimd-4.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2400b0c6406d4170fbe400c717128d74adabacf0431bbe9bdfde82eb1b507ba3"}, + {file = "simsimd-4.3.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1740b6e09a630780091416d7a478bc33b4bdf528119a18dd115d78c600eb1ae"}, + {file = "simsimd-4.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b09bb68cdf1d6395200c64a4b497ec171b6b8c0df07f6f132c932879b3bedf3"}, + {file = "simsimd-4.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:842d4f1c3aaa8057d4aa66c5e4528a2c0269dfea691318f3781da3bc60204d4b"}, + {file = "simsimd-4.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e44fd207e61bf412a61249471ed7c70b9dc460960fd0b1013de210a484369283"}, + {file = 
"simsimd-4.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cbeb62a0c65a8e33c6de06f087706afa81786e63d07748fea6e655390c03ec24"}, + {file = "simsimd-4.3.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:27e61b200ce65e279e206eb0512810f49e58a6f3ff79213e29c72f88cfd45238"}, + {file = "simsimd-4.3.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:4b0cbea79443bc567937e3e05cc564e51d2eb5cf6373b9b65ded9ad8aef26fa5"}, + {file = "simsimd-4.3.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:30883e069590ea717c6b616662e74a6a88fe7978e9831422c3470f314c1fc635"}, + {file = "simsimd-4.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d6ba76dda2cdfcadce36048086bf80e3702c5fe527cc05e741adbe0d6c9cb199"}, + {file = "simsimd-4.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:487df95563ec31aa69c3023f8efecdd62511f4fe1b91f264154d66df1202bfff"}, + {file = "simsimd-4.3.1-cp312-cp312-win32.whl", hash = "sha256:b781486486851e29e7034151e1d8ce7994fbcfba7927450304c4ae1022384aa8"}, + {file = "simsimd-4.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:872966082b39d6a67e80a3758fab53d590f8f39ecdc2b6a4a61bc9e8bb048aa7"}, + {file = "simsimd-4.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:91efe533afee774c3d3af18aa169650d24d617e549192e0a1c471fb6f9366f2f"}, + {file = "simsimd-4.3.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:adfae1bdc8d391d85705d5c57dfb395539b11d6705cfa649db9b7a9b8f44e86c"}, + {file = "simsimd-4.3.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc396c95cfba93fe3dfc00a446785e84ea0c43b153dd087a32406edb112db782"}, + {file = "simsimd-4.3.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74355cb1123f278fd58aa4f2ee85affcb223e2bf437262aecbc050ce82bc1c17"}, + {file = "simsimd-4.3.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ee9ddaa8d52207d7f26e86d30f9857798450d94a5e0411adcf229dadffcf17"}, + {file = "simsimd-4.3.1-cp36-cp36m-manylinux_2_28_aarch64.whl", hash = "sha256:808a85f8383a6b70741cd2916dbfc23e85a8450f0a2a43e6d4fd7c2efc41e143"}, + {file = "simsimd-4.3.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:e771afa919fc1886fb8def228200723f713e2db15f83504b7273f5893b4d8979"}, + {file = "simsimd-4.3.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:49a524f52c91c9633801248fb85475ff6d0690fd50c56f9753804596aca2fe9f"}, + {file = "simsimd-4.3.1-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:d39789aa24f0e3b2f858114b57719b94393259dc2079a261299e083573c725b7"}, + {file = "simsimd-4.3.1-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:c2248e8e3c712fbdf5428f7b626bca373ed425cdc4330ee2fb4f1b0cfbdf041f"}, + {file = "simsimd-4.3.1-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:817ea55402b074f0c7a62c73680fb658351dbdd8a227386374f464eda0db2c23"}, + {file = "simsimd-4.3.1-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:38caa87333b757796b126035e9295bf6199dfcce68fe4fe0356acc8f9c3f0dc2"}, + {file = "simsimd-4.3.1-cp36-cp36m-win32.whl", hash = "sha256:33ef86fc146040302ca5f4e4b342e463625162f99c3ce96170a8aea5582fa954"}, + {file = "simsimd-4.3.1-cp36-cp36m-win_amd64.whl", hash = "sha256:ab2deca67e770c4a73d5a4f2a78dd72ae3c27bc0c6156a07fabcd6b6f9bc6dbb"}, + {file = "simsimd-4.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5e2e650fd7168e7189ca8591495df63add305e2050010418a5f7330714b9e885"}, + {file = "simsimd-4.3.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd41805226d1376855186954adb674065fe6d73af7f2ace16bbb051a1020b8a0"}, + {file = 
"simsimd-4.3.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:caa64290c32a4255aacb6179c8791f3dcdf4d77cd2534055385326fa61afc77f"}, + {file = "simsimd-4.3.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc8830b14e908154299e35602bcb71970402afa6c6abd6e278cb1db3d52f3790"}, + {file = "simsimd-4.3.1-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:25d931c61ebc24ddcabeb38d5fa079d44bd1d9803f6e959b3d4996bda85162af"}, + {file = "simsimd-4.3.1-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:77c5af19d6163c8946ff41ddc6e0e44f4a7b1a53a9a694a34054c6bdb3842bf5"}, + {file = "simsimd-4.3.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2f44bbe06e5d8e520f08a9e3f45f6e3612e6ac5638cbd50330cd6134150d3e97"}, + {file = "simsimd-4.3.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c070cfc2d3d29502b4bcdc19f6d650d5de3554aa284063eee20b66c4f344d456"}, + {file = "simsimd-4.3.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:49eea7bd8b3b5980a29dd5f58a018ab37c37065bf3ea6c6a890133b88b40cc4c"}, + {file = "simsimd-4.3.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cf1622409f45a9fd9bdaf39080821d7192a29452a10ee1a2192f54cd9aac3102"}, + {file = "simsimd-4.3.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:c42013708bf72d8188d57ed04fe983d16113af58210b574d42f2e6ccb5d8f9f5"}, + {file = "simsimd-4.3.1-cp37-cp37m-win32.whl", hash = "sha256:c07e5b7e16fa0435603db54f2004f40fbfbb21ee057354760d80a7384d276168"}, + {file = "simsimd-4.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:d8dc7bee48a2cea4b3025e7391a33c2339e90c7620d6830e9d01699ad577d552"}, + {file = "simsimd-4.3.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:64e69afdd176617d04173056a88bfb312390ba197494de87b2c2ff1f81f6b40b"}, + {file = "simsimd-4.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:98c54a60a992ae5f4a4a4cbe9918bb6fd286d0f37f42a255158d9b15cc8270ed"}, + {file = "simsimd-4.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c9f574d666dc2427ea2a0580ce7c1cbffdff084626700fa038d203d92f54ebc0"}, + {file = "simsimd-4.3.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be952737bf8068af8156e0e543621f1e11aa0707814e9e2e2a0437f69ef8ab3a"}, + {file = "simsimd-4.3.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3306a66e8a7747e6ff134e8fa57763de5bbb6b27964eaf7e6f5558892c9cfab9"}, + {file = "simsimd-4.3.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a79e60aca578be5eedb8432a0aeecf0c795b9d58ced86fad94d0df2e187755b"}, + {file = "simsimd-4.3.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:10a4c11dcf03faf0b3fa2b555b1efdf2a601fc8352068afa82fad053337bbc1e"}, + {file = "simsimd-4.3.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:62ec20b85a680a6087c4a7d2fcdcbd7542bb8e6f250b48225cfa9e4d56f9839b"}, + {file = "simsimd-4.3.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b9d15a471d617b01153e2449aab4cb25d88ecbe09838925011124f1a52f4da08"}, + {file = "simsimd-4.3.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3fbe2b923c4cd81538d291a7efdb1375b2e7d5abca0d553d42e9ce4b8165c140"}, + {file = "simsimd-4.3.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:1766f62c2a1359d07068fbfe9fe85afc43fbc99fd01c61b8b37b78ebb77e36c6"}, + {file = "simsimd-4.3.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:790c2caf6e46537ff54d7bab48845e56cc40f5fdac17c254885e15fd829de9ae"}, + {file = "simsimd-4.3.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = 
"sha256:d7f6aa01c401d768f287c16e1c934e271827d47c853f1bac1e90a0948dea5c71"}, + {file = "simsimd-4.3.1-cp38-cp38-win32.whl", hash = "sha256:8329586d57deab8ef3b2637640be1ef19e109a318c2820441ea8d0e89e5fc804"}, + {file = "simsimd-4.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:c08e05f2632accdbd142cce6c9ee158d5edfea00b2e2f9b18a3e706f8b263e09"}, + {file = "simsimd-4.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3fae3e2099f0877ba144503c08e730f9531534376a6f94b101bffe6600fa5489"}, + {file = "simsimd-4.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:657f6c8ea3cb1b291a060ba669b7596a504d2ea579537954fc60360970e83d62"}, + {file = "simsimd-4.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ac719148cb30dcba5292891748ac1135d03b5e2226d521d6759334821c53e7a"}, + {file = "simsimd-4.3.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:71b7c7aba9a1ccf5ac282d989bd947e0292527ed9f991ae2de84206737181afa"}, + {file = "simsimd-4.3.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6501e9a5a2af9afbd13f9106c50ba6e8e0864841679d68d961d2058c5b296789"}, + {file = "simsimd-4.3.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fecbe50df7dca409660e721fac972ccccb5474908a110ac1cf078c617c8746a0"}, + {file = "simsimd-4.3.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:cdb34e67a34481641d4fa05c73aba7e554dda2849ad29e694e4e85eca1c40cb5"}, + {file = "simsimd-4.3.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:39802460c10cd5bee345b93ce57273ef411c1defe8a560c79ad49e4a37da771a"}, + {file = "simsimd-4.3.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8b92a35b21b6b756851e3353019b3747cee672a20f03641d6b9e6bf6ec519293"}, + {file = "simsimd-4.3.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:026c07b10f6768f827389bd00327864ff8e4692dfaf594e727555ede3a1965ce"}, + {file = "simsimd-4.3.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:930cf301c9233b790c7ec389b94429c36b33e950c483cc728f26434aaf52b5f5"}, + {file = "simsimd-4.3.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:900c08cde5221364add1edc6ea9df097193f2583bf4fd958c3c2c52dade1632c"}, + {file = "simsimd-4.3.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d9c1383a631ed12711c403cb764ccb7a84ce528b1de8254e5415f1610b228ae0"}, + {file = "simsimd-4.3.1-cp39-cp39-win32.whl", hash = "sha256:6c5e0c3eb4697877e06707f87a3160f78f3428f9a0a24e89bcc62382d1f78c00"}, + {file = "simsimd-4.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:74f929374e908f8884d8a69d17e3bb555f489c2f594668d2d3fdb53c51750616"}, + {file = "simsimd-4.3.1-cp39-cp39-win_arm64.whl", hash = "sha256:5a036fc98fdc56ee5bfd7633ebfd6867a5a6b2d3be3105cdcfdb7fd90609054f"}, +] + [[package]] name = "six" version = "1.16.0" @@ -1634,4 +1750,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "75cedf4b0a20f5ee30c546658d311b2d7b5aa86f49fc332d3bbfc3b58adaa2e7" +content-hash = "0ccace04d743e6685fb3a191e7f2eec78ea106bf76167e9419a8683b6a435206" diff --git a/libs/elasticsearch/pyproject.toml b/libs/elasticsearch/pyproject.toml index 22d69e0..1a3f908 100644 --- a/libs/elasticsearch/pyproject.toml +++ b/libs/elasticsearch/pyproject.toml @@ -13,7 +13,7 @@ license = "MIT" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" langchain-core = "^0.1" -elasticsearch = "^8.13.1" +elasticsearch = {version = "^8.13.1", extras = ["vectorstore_mmr"]} [tool.poetry.group.test] optional = true diff --git a/libs/elasticsearch/tests/integration_tests/test_vectorstores.py 
b/libs/elasticsearch/tests/integration_tests/test_vectorstores.py
index 8601a40..097e461 100644
--- a/libs/elasticsearch/tests/integration_tests/test_vectorstores.py
+++ b/libs/elasticsearch/tests/integration_tests/test_vectorstores.py
@@ -2,9 +2,10 @@
 import logging
 import uuid
-from typing import Dict, Iterator
+from typing import Any, Dict, Iterator, List, Optional, Union
 
 import pytest
+from elasticsearch import NotFoundError
 from langchain_core.documents import Document
 
 from langchain_elasticsearch.vectorstores import ElasticsearchStore
@@ -157,3 +158,752 @@ def test_search_by_vector_with_relevance_threshold(
         ]
 
         docsearch.close()
+
+    # Also tested in elasticsearch.helpers.vectorstore
+
+    def test_similarity_search_without_metadata(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test end to end construction and search without metadata."""
+
+        def assert_query(
+            query_body: Dict[str, Any], query: Optional[str]
+        ) -> Dict[str, Any]:
+            assert query_body == {
+                "knn": {
+                    "field": "vector",
+                    "filter": [],
+                    "k": 1,
+                    "num_candidates": 50,
+                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
+                }
+            }
+            return query_body
+
+        texts = ["foo", "bar", "baz"]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            FakeEmbeddings(),
+            **es_params,
+            index_name=index_name,
+        )
+        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
+        assert output == [Document(page_content="foo")]
+
+    async def test_similarity_search_without_metadata_async(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test end to end construction and search without metadata."""
+        texts = ["foo", "bar", "baz"]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            FakeEmbeddings(),
+            **es_params,
+            index_name=index_name,
+        )
+        output = await docsearch.asimilarity_search("foo", k=1)
+        assert output == [Document(page_content="foo")]
+
+    def test_add_embeddings(self, es_params: dict, index_name: str) -> None:
+        """
+        Test add_embeddings, which accepts pre-built embeddings instead of
+        running inference on the texts.
+        This lets you separate the embedded text from the page_content, so the
+        embedded text can be closer to the user's question.
+        For example, the embedded text can be a question, whereas the page_content
+        is the answer.
+ """ + embeddings = ConsistentFakeEmbeddings() + text_input = ["foo1", "foo2", "foo3"] + metadatas = [{"page": i} for i in range(len(text_input))] + + """In real use case, embedding_input can be questions for each text""" + embedding_input = ["foo2", "foo3", "foo1"] + embedding_vectors = embeddings.embed_documents(embedding_input) + + docsearch = ElasticsearchStore( + embedding=embeddings, + **es_params, + index_name=index_name, + ) + docsearch.add_embeddings(list(zip(text_input, embedding_vectors)), metadatas) + output = docsearch.similarity_search("foo1", k=1) + assert output == [Document(page_content="foo3", metadata={"page": 2})] + + def test_similarity_search_with_metadata( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticsearchStore.from_texts( + texts, + ConsistentFakeEmbeddings(), + metadatas=metadatas, + **es_params, + index_name=index_name, + ) + + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + output = docsearch.similarity_search("bar", k=1) + assert output == [Document(page_content="bar", metadata={"page": 1})] + + def test_similarity_search_with_filter( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "foo", "foo"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + **es_params, + index_name=index_name, + ) + + def assert_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + assert query_body == { + "knn": { + "field": "vector", + "filter": [{"term": {"metadata.page": "1"}}], + "k": 3, + "num_candidates": 50, + "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + } + } + return query_body + + output = docsearch.similarity_search( + query="foo", + k=3, + filter=[{"term": {"metadata.page": "1"}}], + custom_query=assert_query, + ) + assert output == [Document(page_content="foo", metadata={"page": 1})] + + def test_similarity_search_with_doc_builder( + self, es_params: dict, index_name: str + ) -> None: + texts = ["foo", "foo", "foo"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + **es_params, + index_name=index_name, + ) + + def custom_document_builder(_: Dict) -> Document: + return Document( + page_content="Mock content!", + metadata={ + "page_number": -1, + "original_filename": "Mock filename!", + }, + ) + + output = docsearch.similarity_search( + query="foo", k=1, doc_builder=custom_document_builder + ) + assert output[0].page_content == "Mock content!" + assert output[0].metadata["page_number"] == -1 + assert output[0].metadata["original_filename"] == "Mock filename!" 
+ + def test_similarity_search_exact_search( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ExactRetrievalStrategy(), + ) + + expected_query = { + "query": { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0", # noqa: E501 + "params": { + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ] + }, + }, + } + } + } + + def assert_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + assert query_body == expected_query + return query_body + + output = docsearch.similarity_search("foo", k=1, custom_query=assert_query) + assert output == [Document(page_content="foo")] + + def test_similarity_search_exact_search_with_filter( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + metadatas=metadatas, + strategy=ElasticsearchStore.ExactRetrievalStrategy(), + ) + + def assert_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + expected_query = { + "query": { + "script_score": { + "query": {"bool": {"filter": [{"term": {"metadata.page": 0}}]}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0", # noqa: E501 + "params": { + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ] + }, + }, + } + } + } + assert query_body == expected_query + return query_body + + output = docsearch.similarity_search( + "foo", + k=1, + custom_query=assert_query, + filter=[{"term": {"metadata.page": 0}}], + ) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + def test_similarity_search_exact_search_distance_dot_product( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ExactRetrievalStrategy(), + distance_strategy="DOT_PRODUCT", + ) + + def assert_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + assert query_body == { + "query": { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": """ + double value = dotProduct(params.query_vector, 'vector'); + return sigmoid(1, Math.E, -value); + """, + "params": { + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ] + }, + }, + } + } + } + return query_body + + output = docsearch.similarity_search("foo", k=1, custom_query=assert_query) + assert output == [Document(page_content="foo")] + + def test_similarity_search_exact_search_unknown_distance_strategy( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with unknown distance strategy.""" + + with pytest.raises(KeyError): + texts = ["foo", "bar", "baz"] + ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + 
strategy=ElasticsearchStore.ExactRetrievalStrategy(), + distance_strategy="NOT_A_STRATEGY", + ) + + def test_max_marginal_relevance_search( + self, es_params: dict, index_name: str + ) -> None: + """Test max marginal relevance search.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ExactRetrievalStrategy(), + ) + + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=3) + sim_output = docsearch.similarity_search(texts[0], k=3) + assert mmr_output == sim_output + + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=2, fetch_k=3) + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == texts[0] + assert mmr_output[1].page_content == texts[1] + + mmr_output = docsearch.max_marginal_relevance_search( + texts[0], + k=2, + fetch_k=3, + lambda_mult=0.1, # more diversity + ) + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == texts[0] + assert mmr_output[1].page_content == texts[2] + + # if fetch_k < k, then the output will be less than k + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=2) + assert len(mmr_output) == 2 + + def test_similarity_search_approx_with_hybrid_search( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True), + ) + + def assert_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + assert query_body == { + "knn": { + "field": "vector", + "filter": [], + "k": 1, + "num_candidates": 50, + "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + }, + "query": { + "bool": { + "filter": [], + "must": [{"match": {"text": {"query": "foo"}}}], + } + }, + "rank": {"rrf": {}}, + } + return query_body + + output = docsearch.similarity_search("foo", k=1, custom_query=assert_query) + assert output == [Document(page_content="foo")] + + def test_similarity_search_approx_by_vector( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + embeddings = ConsistentFakeEmbeddings() + docsearch = ElasticsearchStore.from_texts( + texts, + embedding=embeddings, + **es_params, + index_name=index_name, + ) + query_vector = embeddings.embed_query("foo") + + def assert_query(query_body: dict, query: str) -> Dict[str, Any]: + assert query_body == { + "knn": { + "field": "vector", + "filter": [], + "k": 1, + "num_candidates": 50, + "query_vector": query_vector, + }, + } + return query_body + + # accept ndarray as query vector + output = docsearch.similarity_search_by_vector_with_relevance_scores( + query_vector, + k=1, + custom_query=assert_query, + ) + assert output == [(Document(page_content="foo"), 1.0)] + + def test_similarity_search_approx_with_hybrid_search_rrf( + self, es_params: dict, index_name: str + ) -> None: + """Test end to end construction and rrf hybrid search with metadata.""" + from functools import partial + + # 1. 
check query_body is okay + rrf_test_cases: List[Optional[Union[dict, bool]]] = [ + True, + False, + {"rank_constant": 1, "window_size": 5}, + ] + for rrf_test_case in rrf_test_cases: + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ApproxRetrievalStrategy( + hybrid=True, rrf=rrf_test_case + ), + ) + + def assert_query( + query_body: dict, + query: str, + rrf: Optional[Union[dict, bool]] = True, + ) -> dict: + cmp_query_body = { + "knn": { + "field": "vector", + "filter": [], + "k": 3, + "num_candidates": 50, + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + }, + "query": { + "bool": { + "filter": [], + "must": [{"match": {"text": {"query": "foo"}}}], + } + }, + } + + if isinstance(rrf, dict): + cmp_query_body["rank"] = {"rrf": rrf} + elif isinstance(rrf, bool) and rrf is True: + cmp_query_body["rank"] = {"rrf": {}} + + assert query_body == cmp_query_body + + return query_body + + ## without fetch_k parameter + output = docsearch.similarity_search( + "foo", k=3, custom_query=partial(assert_query, rrf=rrf_test_case) + ) + + # 2. check query result is okay + es_output = docsearch._store.client.search( + index=index_name, + query={ + "bool": { + "filter": [], + "must": [{"match": {"text": {"query": "foo"}}}], + } + }, + knn={ + "field": "vector", + "filter": [], + "k": 3, + "num_candidates": 50, + "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + }, + size=3, + rank={"rrf": {"rank_constant": 1, "window_size": 5}}, + ) + + assert [o.page_content for o in output] == [ + e["_source"]["text"] for e in es_output["hits"]["hits"] + ] + + # 3. check rrf default option is okay + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True), + ) + + ## with fetch_k parameter + output = docsearch.similarity_search( + "foo", k=3, fetch_k=50, custom_query=assert_query + ) + + def test_similarity_search_approx_with_custom_query_fn( + self, es_params: dict, index_name: str + ) -> None: + """test that custom query function is called + with the query string and query body""" + + def my_custom_query( + query_body: Dict[str, Any], query: Optional[str] + ) -> Dict[str, Any]: + assert query == "foo" + assert query_body == { + "knn": { + "field": "vector", + "filter": [], + "k": 1, + "num_candidates": 50, + "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], + } + } + return {"query": {"match": {"text": {"query": "bar"}}}} + + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticsearchStore.from_texts( + texts, FakeEmbeddings(), **es_params, index_name=index_name + ) + output = docsearch.similarity_search("foo", k=1, custom_query=my_custom_query) + assert output == [Document(page_content="bar")] + + def test_deployed_model_check_fails_approx( + self, es_params: dict, index_name: str + ) -> None: + """test that exceptions are raised if a specified model is not deployed""" + with pytest.raises(NotFoundError): + ElasticsearchStore.from_texts( + texts=["foo", "bar", "baz"], + embedding=ConsistentFakeEmbeddings(10), + **es_params, + index_name=index_name, + strategy=ElasticsearchStore.ApproxRetrievalStrategy( + query_model_id="non-existing model ID", + ), + ) + + def test_deployed_model_check_fails_sparse( + self, es_params: dict, 
index_name: str
+    ) -> None:
+        """test that exceptions are raised if a specified model is not deployed"""
+        with pytest.raises(NotFoundError):
+            ElasticsearchStore.from_texts(
+                texts=["foo", "bar", "baz"],
+                **es_params,
+                index_name=index_name,
+                strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(
+                    model_id="non-existing model ID"
+                ),
+            )
+
+    def test_elasticsearch_with_relevance_score(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test to make sure the relevance score is scaled to 0-1."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": str(i)} for i in range(len(texts))]
+        embeddings = FakeEmbeddings()
+
+        docsearch = ElasticsearchStore.from_texts(
+            index_name=index_name,
+            texts=texts,
+            embedding=embeddings,
+            metadatas=metadatas,
+            **es_params,
+        )
+
+        embedded_query = embeddings.embed_query("foo")
+        output = docsearch.similarity_search_by_vector_with_relevance_scores(
+            embedding=embedded_query, k=1
+        )
+        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
+
+    def test_similarity_search_bm25_search(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test end to end using the BM25 retrieval strategy."""
+        texts = ["foo", "bar", "baz"]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            None,
+            **es_params,
+            index_name=index_name,
+            strategy=ElasticsearchStore.BM25RetrievalStrategy(),
+        )
+
+        def assert_query(
+            query_body: Dict[str, Any], query: Optional[str]
+        ) -> Dict[str, Any]:
+            assert query_body == {
+                "query": {
+                    "bool": {
+                        "must": [{"match": {"text": {"query": "foo"}}}],
+                        "filter": [],
+                    }
+                }
+            }
+            return query_body
+
+        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
+        assert output == [Document(page_content="foo")]
+
+    def test_similarity_search_bm25_search_with_filter(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test end to end using the BM25 retrieval strategy with metadata."""
+        texts = ["foo", "foo", "foo"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            None,
+            **es_params,
+            index_name=index_name,
+            metadatas=metadatas,
+            strategy=ElasticsearchStore.BM25RetrievalStrategy(),
+        )
+
+        def assert_query(
+            query_body: Dict[str, Any], query: Optional[str]
+        ) -> Dict[str, Any]:
+            assert query_body == {
+                "query": {
+                    "bool": {
+                        "must": [{"match": {"text": {"query": "foo"}}}],
+                        "filter": [{"term": {"metadata.page": 1}}],
+                    }
+                }
+            }
+            return query_body
+
+        output = docsearch.similarity_search(
+            "foo",
+            k=3,
+            custom_query=assert_query,
+            filter=[{"term": {"metadata.page": 1}}],
+        )
+        assert output == [Document(page_content="foo", metadata={"page": 1})]
+
+    def test_elasticsearch_with_relevance_threshold(
+        self, es_params: dict, index_name: str
+    ) -> None:
+        """Test to make sure the relevance threshold is respected."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": str(i)} for i in range(len(texts))]
+        embeddings = FakeEmbeddings()
+
+        docsearch = ElasticsearchStore.from_texts(
+            index_name=index_name,
+            texts=texts,
+            embedding=embeddings,
+            metadatas=metadatas,
+            **es_params,
+        )
+
+        # Find a good threshold for testing
+        query_string = "foo"
+        embedded_query = embeddings.embed_query(query_string)
+        top3 = docsearch.similarity_search_by_vector_with_relevance_scores(
+            embedding=embedded_query, k=3
+        )
+        similarity_of_second_ranked = top3[1][1]
+        assert len(top3) == 3
+
+        # Test threshold
+        retriever = docsearch.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"score_threshold": similarity_of_second_ranked},
+        )
+        output = retriever.get_relevant_documents(query=query_string)
+
+        assert output == [
+            top3[0][0],
+            top3[1][0],
+            # third ranked is out
+        ]
+
+    def test_elasticsearch_delete_ids(self, es_params: dict, index_name: str) -> None:
+        """Test delete methods from vector store."""
+        texts = ["foo", "bar", "baz", "gni"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticsearchStore(
+            embedding=ConsistentFakeEmbeddings(),
+            **es_params,
+            index_name=index_name,
+        )
+
+        ids = docsearch.add_texts(texts, metadatas)
+        output = docsearch.similarity_search("foo", k=10)
+        assert len(output) == 4
+
+        docsearch.delete(ids[1:3])
+        output = docsearch.similarity_search("foo", k=10)
+        assert len(output) == 2
+
+        docsearch.delete(["not-existing"])
+        output = docsearch.similarity_search("foo", k=10)
+        assert len(output) == 2
+
+        docsearch.delete([ids[0]])
+        output = docsearch.similarity_search("foo", k=10)
+        assert len(output) == 1
+
+        docsearch.delete([ids[3]])
+        output = docsearch.similarity_search("gni", k=10)
+        assert len(output) == 0