diff --git a/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb b/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb new file mode 100644 index 000000000..b1d55875c --- /dev/null +++ b/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb @@ -0,0 +1,718 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Text Search with Milvus\n", + "\n", + "With the release of Milvus 2.5, Full Text Search enables users to efficiently search for text based on keywords or phrases, providing powerful text retrieval capabilities. This feature enhances search accuracy and can be seamlessly combined with embedding-based retrieval for hybrid search, allowing for both semantic and keyword-based results in a single query. In this notebook, we will show basic usage of full text search in Milvus." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparation\n", + "\n", + "### Download the dataset\n", + "The following command will download the example data used in original Anthropic [demo](https://github.com/anthropics/anthropic-cookbook/blob/main/skills/contextual-embeddings/guide.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "powershell" + } + }, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/codebase_chunks.json\n", + "!wget https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/evaluation_set.jsonl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Milvus 2.5\n", + "Check the [official installation guide](https://milvus.io/docs/install_standalone-docker-compose.md) for more details.\n", + "\n", + "### Install PyMilvus\n", + "Run the following command to install PyMilvus:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "powershell" + } + }, + "outputs": [], + "source": [ + "pip install \"pymilvus[model]\" -U " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the Dense Embedding Key\n", + "In this notebook, we will use Voyage AI's embedding model to generate dense embeddings. Please set the `VOYAGE_API` environment variable to your API key before running the cells below. Alternatively, you can easily swap in a different dense embedding model.\n", + "\n", + "### Define the Retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data3/david/git/hybrid_exp/runtime/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
class HybridRetriever:
    """Store text chunks in one Milvus collection and query them three ways.

    Each chunk is stored with its raw text (``content``), a BM25 sparse vector
    that a collection-level function derives from ``content``, and a dense
    vector produced by ``dense_embedding_function``. ``search`` can query the
    sparse field, the dense field, or both fused with ``RRFRanker``.
    """

    # Stop words removed by the analyzer attached to the ``content`` field.
    _STOP_WORDS = (
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this",
        "to", "was", "will", "with",
    )

    # Scalar fields requested back with every search hit.
    _OUTPUT_FIELDS = (
        "content",
        "original_uuid",
        "doc_id",
        "chunk_id",
        "original_index",
    )

    # Values accepted by the ``mode`` argument of ``search``.
    _SEARCH_MODES = ("sparse", "dense", "hybrid")

    def __init__(self, uri, collection_name="hybrid", dense_embedding_function=None):
        """Connect to Milvus and remember the collection / embedding settings.

        Args:
            uri: Address passed to ``MilvusClient``.
            collection_name: Name of the collection to create and query.
            dense_embedding_function: Callable that takes a list of strings and
                returns either a sequence of dense vectors or a dict with a
                ``"dense"`` entry; it must also expose a ``dim`` attribute.
                Required for everything except ``search(..., mode="sparse")``.
        """
        self.uri = uri
        self.collection_name = collection_name
        self.embedding_function = dense_embedding_function
        # Kept for backward compatibility; not read anywhere in this class.
        self.use_reranker = True
        self.use_sparse = True
        self.client = MilvusClient(uri=uri)

    def _require_embedding_function(self):
        """Return the embedding function or raise a clear error if it is unset."""
        if self.embedding_function is None:
            raise ValueError(
                "A dense_embedding_function is required for this operation; "
                "pass one when constructing HybridRetriever."
            )
        return self.embedding_function

    def _dense_dim(self):
        """Return the dense vector dimension reported by the embedding function."""
        dim = self._require_embedding_function().dim
        # ``dim`` may be a plain int or a dict with a "dense" entry.
        return dim["dense"] if isinstance(dim, dict) else dim

    def _embed_dense(self, text):
        """Embed a single string and return its dense vector."""
        embedding = self._require_embedding_function()([text])
        # Handle both return shapes: {"dense": [...]} or a bare sequence of vectors.
        if isinstance(embedding, dict) and "dense" in embedding:
            return embedding["dense"][0]
        return embedding[0]

    def build_collection(self):
        """Create the collection: schema, BM25 function, and both vector indexes.

        Raises:
            ValueError: If no dense embedding function was configured (the
                dense vector dimension cannot be determined without it).
        """
        dense_dim = self._dense_dim()

        analyzer_params = {
            "tokenizer": "standard",
            "filter": [
                "lowercase",
                {"type": "length", "max": 200},
                {"type": "stemmer", "language": "english"},
                {"type": "stop", "stop_words": list(self._STOP_WORDS)},
            ],
        }

        schema = MilvusClient.create_schema()
        schema.add_field(
            field_name="pk",
            datatype=DataType.VARCHAR,
            is_primary=True,
            auto_id=True,
            max_length=100,
        )
        schema.add_field(
            field_name="content",
            datatype=DataType.VARCHAR,
            max_length=65535,
            analyzer_params=analyzer_params,
            enable_match=True,
            enable_analyzer=True,
        )
        schema.add_field(
            field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR
        )
        schema.add_field(
            field_name="dense_vector", datatype=DataType.FLOAT_VECTOR, dim=dense_dim
        )
        schema.add_field(
            field_name="original_uuid", datatype=DataType.VARCHAR, max_length=128
        )
        schema.add_field(field_name="doc_id", datatype=DataType.VARCHAR, max_length=64)
        schema.add_field(
            field_name="chunk_id", datatype=DataType.VARCHAR, max_length=64
        )
        schema.add_field(field_name="original_index", datatype=DataType.INT32)

        # The BM25 function fills ``sparse_vector`` from ``content``, so callers
        # never supply the sparse vector themselves.
        bm25_function = Function(
            name="bm25",
            function_type=FunctionType.BM25,
            input_field_names=["content"],
            output_field_names=["sparse_vector"],
        )
        schema.add_function(bm25_function)

        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="sparse_vector",
            index_type="SPARSE_INVERTED_INDEX",
            metric_type="BM25",
        )
        index_params.add_index(
            field_name="dense_vector", index_type="FLAT", metric_type="IP"
        )

        self.client.create_collection(
            collection_name=self.collection_name,
            schema=schema,
            index_params=index_params,
        )

    def insert_data(self, chunk, metadata):
        """Embed ``chunk`` and insert it together with ``metadata``.

        Args:
            chunk: Text to embed for the ``dense_vector`` field.
            metadata: Dict of the remaining field values for the row; it is
                merged into the inserted row as-is.

        Raises:
            ValueError: If no dense embedding function was configured.
        """
        dense_vec = self._embed_dense(chunk)
        self.client.insert(
            self.collection_name, {"dense_vector": dense_vec, **metadata}
        )

    def search(self, query: str, k: int = 20, mode="hybrid"):
        """Return up to ``k`` hits for ``query``.

        Args:
            query: Query text.
            k: Maximum number of hits to return.
            mode: ``"sparse"`` (BM25 full-text search on ``sparse_vector``),
                ``"dense"`` (vector search on ``dense_vector``), or
                ``"hybrid"`` (both requests fused with ``RRFRanker``).

        Returns:
            A list of dicts with keys ``doc_id``, ``chunk_id``, ``content`` and
            ``score`` (the hit's ``distance`` value).

        Raises:
            ValueError: If ``mode`` is not one of the supported values, or if a
                dense embedding is needed but no embedding function is set.
        """
        if mode not in self._SEARCH_MODES:
            raise ValueError(
                f"Invalid mode {mode!r}; expected one of {self._SEARCH_MODES}"
            )

        output_fields = list(self._OUTPUT_FIELDS)

        if mode == "sparse":
            # Full-text search takes the raw query string; no embedding needed.
            results = self.client.search(
                collection_name=self.collection_name,
                data=[query],
                anns_field="sparse_vector",
                limit=k,
                output_fields=output_fields,
            )
        else:
            dense_vec = self._embed_dense(query)
            if mode == "dense":
                results = self.client.search(
                    collection_name=self.collection_name,
                    data=[dense_vec],
                    anns_field="dense_vector",
                    limit=k,
                    output_fields=output_fields,
                )
            else:  # hybrid
                full_text_search_req = AnnSearchRequest(
                    [query], "sparse_vector", {"metric_type": "BM25"}, limit=k
                )
                dense_req = AnnSearchRequest(
                    [dense_vec], "dense_vector", {"metric_type": "IP"}, limit=k
                )
                results = self.client.hybrid_search(
                    self.collection_name,
                    [full_text_search_req, dense_req],
                    ranker=RRFRanker(),
                    limit=k,
                    output_fields=output_fields,
                )

        # One query was sent, so the hits of interest are in ``results[0]``.
        return [
            {
                "doc_id": hit["entity"]["doc_id"],
                "chunk_id": hit["entity"]["chunk_id"],
                "content": hit["entity"]["content"],
                "score": hit["distance"],
            }
            for hit in results[0]
        ]
# Smoke-test the full-text (BM25) path on its own: ask for the three
# best keyword matches for a sample question and show the raw hits.
results = standard_retriever.search(query="create a logger?", k=3, mode="sparse")
print(results)
dataset into Milvus, we can use dense, sparse, or hybrid search to retrieve the top 5 results. You can change the `mode` and evaluate each one. We present the Pass@5 metric, which involves retrieving the top 5 results for each query and calculating the Recall." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What is the purpose of the DiffExecutor struct?\n", + "How do you create a new DiffExecutor instance?\n", + "What happens in the `run_target` method of the DiffExecutor?\n", + "What is the purpose of the SIGNALS and SIGNALS_PTR static variables?\n", + "How does the harness closure work?\n", + "What is the purpose of the StdMapObserver?\n", + "What feedbacks are used in this fuzzer?\n", + "How is the initial corpus generated?\n", + "What stages are used in the fuzzer?\n", + "What does the `OomObserver` struct do?\n", + "How do I create a new `OomObserver`?\n", + "What does the function `both_require` do?\n", + "How does `both_require` check for the sequence 'a', 'b', 'c'?\n", + "What is the purpose of the `len` parameter in `both_require`?\n", + "What does the vuln() function do?\n", + "How is input normally read in the main() function?\n", + "What input condition causes the program to abort in the main() function?\n", + "What is the purpose of the `MergeScheduler` struct?\n", + "How does the `on_add` method of the `MergeScheduler` work?\n", + "What is the purpose of the `removable()` method in the `MergeScheduler`?\n", + "How does the `on_remove` method of the `MergeScheduler` work?\n", + "What is the purpose of the `current()` method in the `MergeScheduler`?\n", + "Why is the `next()` method of the `MergeScheduler` unimplemented?\n", + "How are the `Fp` and `Lr` registers defined as aliases in the `Regs` enum?\n", + "What is the purpose of the `get_backdoor_arch_regs` function?\n", + "How do I get the `EnumMap` of backdoor architecture registers?\n", 
+ "How do you convert a `NautilusInput` to a `BytesInput`?\n", + "How do you get the `Tree` representation of a `NautilusInput`?\n", + "What traits does `NautilusInput` implement?\n", + "How do you initialize the logger?\n", + "How is the log file created?\n", + "What logger implementation is being used?\n", + "How do you register a new type in the Registry?\n", + "What is the purpose of the `_real_register` method?\n", + "How can you retrieve registered types from the Registry?\n", + "How are targeted types handled in the Registry?\n", + "What is the purpose of the `_modules` set in the Registry?\n", + "What does the Octal class do?\n", + "How does the decode method of the Octal class work?\n", + "What does the getTarget method of the Octal class do?\n", + "What external dependencies does the Octal class have?\n", + "How does the decode method of the A1z26 class work?\n", + "What is the purpose of the priority method in the A1z26 class?\n", + "What does the getParams method do in the A1z26 class?\n", + "What is the purpose of the getTarget method?\n", + "How are the delimiters in the input ciphertext handled?\n", + "What is the purpose of the priority method in the Base58_ripple class?\n", + "What does the getParams method of the Base58_ripple class do?\n", + "What is the purpose of the getTarget method in the Base58_ripple class?\n", + "How are the character and word boundaries determined in the Morse code decoding process?\n", + "What is the purpose of the priority method in the Morse_code class?\n", + "What does the `getInfo` method of the `Soundex` class do?\n", + "What does the `getTarget` method of the `Soundex` class return?\n", + "How does the `attemptCrack` method of the `Soundex` class attempt to crack a Soundex-encoded ciphertext?\n", + "What does the `sortlistwithdict` method of the `Soundex` class do?\n", + "What parameters does the `Soundex` class take in its constructor?\n", + "What parameters can be configured for the Tap_code decoder?\n", + "How 
does the CipheyDists class handle configuration?\n", + "What is the priority method used for in the Base69 class?\n", + "How are the parameters for the Base69 class specified?\n", + "What encryption schemes do the tests cover?\n", + "What is the expected decrypted plaintext used in most of the tests?\n", + "What does the MakeBools function return?\n", + "How does the MakeFixedStrings function work?\n", + "What is the purpose of the long string in the MakeStrings function?\n", + "What UUID values are returned by the MakeUUIDs function?\n", + "How can I append a column to a ColumnTuple?\n", + "How do I load column data from an input stream into a ColumnTuple?\n", + "How can I clear the data in a ColumnTuple?\n", + "How do I get the number of rows in a ColumnTuple?\n", + "What is the purpose of the ColumnTupleT class?\n", + "How can you append elements to a ColumnIPv4 instance?\n", + "How can you access elements from a ColumnIPv4 instance?\n", + "How can you append the content of another column to a ColumnIPv4 instance?\n", + "How can you get the number of rows in a ColumnIPv4 instance?\n", + "How can you create a slice of a ColumnIPv4 instance?\n", + "What does the GetTypeMeta() function do?\n", + "How does the CompateStringsCaseInsensitive() function compare two strings case-insensitively?\n", + "What regular expression syntax is supported on Windows and Mac for death tests?\n", + "What is a known caveat with \"threadsafe\" style death tests?\n", + "How do you read a string using WireFormat?\n", + "How do you read a 64-bit unsigned integer using WireFormat?\n", + "What is the purpose of the LoadPrefix function in the Column class?\n", + "What is the purpose of the SavePrefix function in the Column class?\n", + "How does the Save function in the Column class work?\n", + "How does the ColumnLowCardinality class handle null values?\n", + "What geometric data types are supported by the code?\n", + "How can you append an element to a ColumnGeo?\n", + "How can you access 
an element in a ColumnGeo?\n", + "How can you append the content of one ColumnGeo to another?\n", + "How can you clear the data of a ColumnGeo?\n", + "How do you construct a ProjectedIterator?\n", + "How do you increment and decrement a ProjectedIterator?\n", + "What are the possible values for the ConsoleOutput enum?\n", + "What package is the ConsoleOutput enum defined in?\n", + "What do the different values of the ConsoleOutput enum represent?\n", + "How does the UpdateChecker store the timestamp of the last update check?\n", + "What does the UpdateChecker return if the current version is up to date?\n", + "How does the DefaultCredentialRetrievers class handle credential helpers on Windows?\n", + "How does the DefaultCredentialRetrievers class avoid duplicate CredentialRetriever instances?\n", + "What does the `ReproducibleImageTest` test class verify?\n", + "How does the `createImage()` method create the test image?\n", + "What is the expected tarball structure and how is it verified in `testTarballStructure()`?\n", + "What package does the HelloWorld class belong to?\n", + "What license is this code released under?\n", + "What year was this code copyrighted?\n", + "What company owns the copyright to this code?\n", + "How does MavenSettingsServerCredentials infer credentials for a server?\n", + "What exceptions can be thrown when inferring credentials with MavenSettingsServerCredentials?\n", + "What is the format of the returned AuthProperty when inferring credentials?\n", + "How are the test settings files used in the tests?\n", + "How does the testPull() method verify the correctness of the pulled BLOB?\n", + "How is the RegistryClient instance created in the test methods?\n", + "How does JibBuildRunner handle a RegistryUnauthorizedException with a 403 Forbidden status code?\n", + "How does the buildToDockerDaemonAndRun method verify the built image?\n", + "How does the testExecute_dockerClient test work?\n", + "What exception is thrown when registry 
authentication fails?\n", + "What information is included in the exception message when a `RegistryAuthenticationFailedException` is thrown?\n", + "What is the default length of a generated pepper when no length is specified?\n", + "Is it possible to generate a pepper with a length of zero?\n", + "What is the expected value of the pepper returned by the PepperGenerator.get() method?\n", + "What is the purpose of the slowEquals method that takes two CharSequence objects?\n", + "How does the hash method handle the presence or absence of a salt value?\n", + "What are the input parameters for the BalloonHashingFunction constructor?\n", + "How can I obtain an instance of the BalloonHashingFunction using the factory method?\n", + "What are the test vectors used in the `TEST_VECTORS` array?\n", + "What is the maximum digest size supported by this Blake2b implementation?\n", + "How do you reset the hasher to its initial state?\n", + "What class does BadParametersException extend?\n", + "What package does the BadParametersException class belong to?\n", + "Who is the author of the BadParametersException class?\n", + "Since which version has the BadParametersException class been available?\n", + "What is the purpose of the Hash class?\n", + "What information does the Hash class store?\n", + "How do I create a HashBuilder instance?\n", + "How do I specify the hashing algorithm to use with the HashBuilder?\n", + "What is the difference between withPBKDF2() and withCompressedPBKDF2()?\n", + "How do you create an instance of MessageDigestFunction with a specific hashing algorithm and salt option?\n", + "How do you hash a password using MessageDigestFunction?\n", + "How do you check if a password matches a hashed value using MessageDigestFunction?\n", + "How can you retrieve the hashing algorithm and salt option used by a MessageDigestFunction instance?\n", + "What does the test method `issue92()` do?\n", + "What does the test method `issue99()` do?\n", + "What does the test 
method `issue93()` do?\n", + "What does the test method `issue120()` do?\n", + "Which package does the Argon2 enum belong to?\n", + "What is the purpose of the `Tag` class?\n", + "What is the purpose of the `pull` function in the `Tag` class?\n", + "How does the Serializer class serialize branch control instructions like br and br_if?\n", + "What is the purpose of the `Log` class?\n", + "What is the expected behavior of the `body` method of the `Log` class?\n", + "What is the purpose of the `procRaise` function in the `Environ` class?\n", + "How does the `procRaise` function handle unsupported signals?\n", + "What is the purpose of the `printPluginMock` function?\n", + "What is the purpose of the `body` method in the mock classes?\n", + "How do you access the magic bytes of a Module?\n", + "What sections are contained in a Module?\n", + "How do you check if a Module has been validated?\n", + "How can I retrieve the current log level using the provided classes?\n", + "How can I set an integer option value using the provided classes?\n", + "What is the purpose of the AVRescaleQ class?\n", + "What is the purpose of the WasmEdge::PO namespace?\n", + "What error codes are defined in the ErrCode enum?\n", + "What data does the Error class store?\n", + "How can I construct an Error object?\n", + "How can I access the error code and message of an Error object?\n", + "How can I set the ID of a specific chapter using the provided functions?\n", + "How can I set the time base of a specific chapter?\n", + "What does the AVChapterStart function do?\n", + "What is the purpose of the `importPk` function?\n", + "How does the `pkExportData` function work?\n", + "How do you create a new instance of the `Pty` struct?\n", + "How can you access the child process watcher associated with a `Pty` instance?\n", + "What does the `merge` function do?\n", + "How does the `merge_tables` function work?\n", + "What does the `merge_sequence` test verify?\n", + "How does the `get_pw_entry` 
function work?\n", + "What fields does the `Mouse` struct contain?\n", + "How are the default mouse bindings obtained in the `default` implementation of `MouseBindings`?\n", + "How is deserialization handled for the `MouseBindings` struct?\n", + "What is the purpose of the `MouseBinding` type?\n", + "What is the purpose of the `new_nop` method?\n", + "How does the Scheduler schedule a new event?\n", + "How can you cancel a scheduled event?\n", + "How can you check if a timer is already scheduled?\n", + "What does the `attach_handler` function do?\n", + "How do you create a new `Row` instance?\n", + "What does the `grow` method do in the `Row` struct?\n", + "How does the `shrink` method work in the `Row` struct?\n", + "What is the purpose of the `Debug` struct?\n", + "How can I specify the renderer preference in Alacritty?\n", + "What is the purpose of the Display struct?\n", + "How does the Display struct handle configuration updates?\n", + "What is the purpose of the FrameTimer struct?\n", + "What class does HasWeightCol extend?\n", + "What is the name and description of the weight column parameter?\n", + "How can I get the value of the weight column parameter?\n", + "What is the default value of the weight column parameter?\n", + "What parameters does the VectorSlicer class have?\n", + "How do I specify the indices to slice from the input vector?\n", + "What restrictions are there on the indices parameter?\n", + "How does an operator or UDF get access to the current epoch number?\n", + "Are the vector and l2Norm fields mutable in the VectorWithNorm class?\n", + "How do you set the parameters of the UnivariateFeatureSelector?\n", + "What happens if you don't set the feature_type or label_type parameters of the UnivariateFeatureSelector?\n", + "What is the output schema of the UnivariateFeatureSelectorModel after transforming data?\n", + "How does the testParam() method verify the parameter settings of LinearRegression?\n", + "What does the 
testInputTypeConversion() method verify?\n", + "What does the testRegularization() method check?\n", + "What is the maximum size of a segment that the MemorySegmentWriter will write?\n", + "What is the purpose of the HeadOperatorCoordinator class?\n", + "How does the HeadOperatorCoordinator determine when to send out a GloballyAlignedEvent?\n", + "How is the input data generated in this example?\n", + "How are the results extracted and displayed in this example?\n", + "What execution environment is used in this example?\n", + "How do you create an IndexToStringModel instance?\n", + "How do you set the model data for the IndexToStringModel?\n", + "How do you extract and display the results after applying the IndexToStringModel?\n", + "How do you create a new instance of the ColorEndPatternConverter?\n", + "What parameters does the ColorEndPatternConverter constructor take?\n", + "What namespaces are used in this file?\n", + "What does the XMLFilenameFilter constructor do?\n", + "What is the purpose of the pattern and replacement strings constructed in the XMLFilenameFilter constructor?\n", + "How would the XMLFilenameFilter be used?\n", + "What C++ standard library classes are used in this code?\n", + "How does the testTrigger unit test work?\n", + "How does the testValid unit test work?\n", + "How do you configure the logging system with a specific layout using BasicConfigurator?\n", + "How do you configure the logging system with a specific appender using BasicConfigurator?\n", + "How do you reset the logging configuration to its default state using BasicConfigurator?\n", + "What is the default layout used by BasicConfigurator if no layout is provided?\n", + "What is the purpose of the WideLife template class used in the code?\n", + "How does the hexdump function handle different character types for logging?\n", + "How do you create a DenyAllFilter instance?\n", + "What do you need to do after creating a DenyAllFilter instance?\n", + "What is the signature of the 
def load_jsonl(file_path: str):
    """Load a JSON Lines file and return a list of dictionaries.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising a JSON decode error.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


def _golden_chunk_contents(query_item):
    """Return the stripped text of every golden chunk referenced by ``query_item``.

    ``golden_chunk_uuids`` holds ``(doc_uuid, chunk_index)`` pairs; each pair is
    resolved against ``golden_documents``. Pairs that cannot be resolved are
    silently ignored.
    """
    contents = []
    for doc_uuid, chunk_index in query_item["golden_chunk_uuids"]:
        golden_doc = next(
            (doc for doc in query_item["golden_documents"] if doc["uuid"] == doc_uuid),
            None,
        )
        if not golden_doc:
            continue
        golden_chunk = next(
            (chunk for chunk in golden_doc["chunks"] if chunk["index"] == chunk_index),
            None,
        )
        if golden_chunk:
            contents.append(golden_chunk["content"].strip())
    return contents


def _recall_at_k(golden_contents, results, k):
    """Fraction of ``golden_contents`` whose text appears in the top ``k`` results.

    ``golden_contents`` must be non-empty.
    """
    retrieved = {doc["content"].strip() for doc in results[:k]}
    found = sum(1 for content in golden_contents if content in retrieved)
    return found / len(golden_contents)


dataset = load_jsonl("evaluation_set.jsonl")
k = 5

# mode can be "dense", "sparse" or "hybrid".
mode = "hybrid"

total_query_score = 0
num_queries = 0

for query_item in dataset:
    query = query_item["query"]
    print(query)

    golden_contents = _golden_chunk_contents(query_item)
    if not golden_contents:
        # Nothing to score against; skip rather than divide by zero.
        continue

    # Use the configured ``k`` so changing it above really changes the metric.
    results = standard_retriever.search(query, mode=mode, k=k)

    total_query_score += _recall_at_k(golden_contents, results, k)
    num_queries += 1

if num_queries:
    print(f"Pass@{k}: {total_query_score / num_queries}")
else:
    print("No scorable queries found in the evaluation set.")