From 4d602a108d1ba0cc977b493d59061b8f8a0d6e47 Mon Sep 17 00:00:00 2001 From: wxywb Date: Thu, 5 Dec 2024 03:11:58 +0000 Subject: [PATCH] Update full text search notebook. Signed-off-by: wxywb --- .../full_text_search_with_milvus.ipynb | 329 +++--------------- 1 file changed, 47 insertions(+), 282 deletions(-) diff --git a/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb b/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb index b1d55875c..21a456682 100644 --- a/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb +++ b/bootcamp/tutorials/quickstart/full_text_search_with_milvus.ipynb @@ -1,5 +1,18 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "\"Open \n", + " \"GitHub" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -40,7 +53,7 @@ "### Install Milvus 2.5\n", "Check the [official installation guide](https://milvus.io/docs/install_standalone-docker-compose.md) for more details.\n", "\n", - "### Install PyMilvs\n", + "### Install PyMilvus\n", "Run the following command to install PyMilvus:" ] }, @@ -61,9 +74,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Configure the Dense Embedding Key\n", - "In this notebook, we will use Voyage AI's embedding model to generate dense embedding, please set the `VOYAGE_API` in your envirioment variables. Or you can change different dense embedding model easily.\n", - "\n", "### Define the Retriever" ] }, @@ -71,19 +81,9 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data3/david/git/hybrid_exp/runtime/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", - "import os\n", "\n", "from pymilvus import (\n", " MilvusClient,\n", @@ -94,7 +94,7 @@ " RRFRanker,\n", ")\n", "\n", - "from pymilvus.model.dense import VoyageEmbeddingFunction\n", + "from pymilvus.model.hybrid import BGEM3EmbeddingFunction\n", "\n", "\n", "class HybridRetriever:\n", @@ -295,11 +295,17 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 108848.72it/s]\n" + ] + } + ], "source": [ - "dense_ef = VoyageEmbeddingFunction(\n", - " api_key=os.getenv(\"VOYAGE_API\"), model_name=\"voyage-2\"\n", - ")\n", + "dense_ef = BGEM3EmbeddingFunction()\n", "standard_retriever = HybridRetriever(\n", " uri=\"http://localhost:19530\",\n", " collection_name=\"milvus_hybrid\",\n", @@ -316,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -350,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -376,265 +382,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "What is the purpose of the DiffExecutor struct?\n", - "How do you create a new DiffExecutor instance?\n", - "What happens in the `run_target` method of the DiffExecutor?\n", - "What is the purpose of the SIGNALS and SIGNALS_PTR static variables?\n", - "How does the harness closure work?\n", - "What is the purpose of the StdMapObserver?\n", - "What feedbacks are used in this fuzzer?\n", - "How is the initial corpus generated?\n", - "What stages are used in the fuzzer?\n", - "What does the `OomObserver` struct do?\n", - "How do I create a new `OomObserver`?\n", - "What does the function `both_require` do?\n", - "How does `both_require` check for the sequence 'a', 'b', 'c'?\n", - "What is the purpose of the `len` parameter in `both_require`?\n", - "What does the vuln() function do?\n", - "How is input normally read in the main() function?\n", - "What input condition causes the program to abort in the main() function?\n", - "What is the purpose of the `MergeScheduler` struct?\n", - "How does the `on_add` method of the `MergeScheduler` work?\n", - "What is the purpose of the `removable()` method in the `MergeScheduler`?\n", - "How does the `on_remove` method of the `MergeScheduler` work?\n", - "What is the purpose of the `current()` method in the `MergeScheduler`?\n", - "Why is the `next()` method of the `MergeScheduler` unimplemented?\n", - "How are the `Fp` and `Lr` registers defined as aliases in the `Regs` enum?\n", - "What is the purpose of the `get_backdoor_arch_regs` function?\n", - "How do I get the `EnumMap` of backdoor architecture registers?\n", - "How do you convert a `NautilusInput` to a `BytesInput`?\n", - "How do you get the `Tree` representation of a `NautilusInput`?\n", - "What traits does `NautilusInput` implement?\n", - "How do you initialize the logger?\n", - "How is the log file created?\n", - "What logger implementation is being used?\n", - "How do you register a new type in the Registry?\n", - "What is the purpose of the `_real_register` method?\n", - "How can you retrieve registered types from the Registry?\n", - "How are targeted types handled in the Registry?\n", - "What is the purpose of the `_modules` set in the Registry?\n", - "What does the Octal class do?\n", - "How does the decode method of the Octal class work?\n", - "What does the getTarget method of the Octal class do?\n", - "What external dependencies does the Octal class have?\n", - "How does the decode method of the A1z26 class work?\n", - "What is the purpose of the priority method in the A1z26 class?\n", - "What does the getParams method do in the A1z26 class?\n", - "What is the purpose of the getTarget method?\n", - "How are the delimiters in the input ciphertext handled?\n", - "What is the purpose of the priority method in the Base58_ripple class?\n", - "What does the getParams method of the Base58_ripple class do?\n", - "What is the purpose of the getTarget method in the Base58_ripple class?\n", - "How are the character and word boundaries determined in the Morse code decoding process?\n", - "What is the purpose of the priority method in the Morse_code class?\n", - "What does the `getInfo` method of the `Soundex` class do?\n", - "What does the `getTarget` method of the `Soundex` class return?\n", - "How does the `attemptCrack` method of the `Soundex` class attempt to crack a Soundex-encoded ciphertext?\n", - "What does the `sortlistwithdict` method of the `Soundex` class do?\n", - "What parameters does the `Soundex` class take in its constructor?\n", - "What parameters can be configured for the Tap_code decoder?\n", - "How does the CipheyDists class handle configuration?\n", - "What is the priority method used for in the Base69 class?\n", - "How are the parameters for the Base69 class specified?\n", - "What encryption schemes do the tests cover?\n", - "What is the expected decrypted plaintext used in most of the tests?\n", - "What does the MakeBools function return?\n", - "How does the MakeFixedStrings function work?\n", - "What is the purpose of the long string in the MakeStrings function?\n", - "What UUID values are returned by the MakeUUIDs function?\n", - "How can I append a column to a ColumnTuple?\n", - "How do I load column data from an input stream into a ColumnTuple?\n", - "How can I clear the data in a ColumnTuple?\n", - "How do I get the number of rows in a ColumnTuple?\n", - "What is the purpose of the ColumnTupleT class?\n", - "How can you append elements to a ColumnIPv4 instance?\n", - "How can you access elements from a ColumnIPv4 instance?\n", - "How can you append the content of another column to a ColumnIPv4 instance?\n", - "How can you get the number of rows in a ColumnIPv4 instance?\n", - "How can you create a slice of a ColumnIPv4 instance?\n", - "What does the GetTypeMeta() function do?\n", - "How does the CompateStringsCaseInsensitive() function compare two strings case-insensitively?\n", - "What regular expression syntax is supported on Windows and Mac for death tests?\n", - "What is a known caveat with \"threadsafe\" style death tests?\n", - "How do you read a string using WireFormat?\n", - "How do you read a 64-bit unsigned integer using WireFormat?\n", - "What is the purpose of the LoadPrefix function in the Column class?\n", - "What is the purpose of the SavePrefix function in the Column class?\n", - "How does the Save function in the Column class work?\n", - "How does the ColumnLowCardinality class handle null values?\n", - "What geometric data types are supported by the code?\n", - "How can you append an element to a ColumnGeo?\n", - "How can you access an element in a ColumnGeo?\n", - "How can you append the content of one ColumnGeo to another?\n", - "How can you clear the data of a ColumnGeo?\n", - "How do you construct a ProjectedIterator?\n", - "How do you increment and decrement a ProjectedIterator?\n", - "What are the possible values for the ConsoleOutput enum?\n", - "What package is the ConsoleOutput enum defined in?\n", - "What do the different values of the ConsoleOutput enum represent?\n", - "How does the UpdateChecker store the timestamp of the last update check?\n", - "What does the UpdateChecker return if the current version is up to date?\n", - "How does the DefaultCredentialRetrievers class handle credential helpers on Windows?\n", - "How does the DefaultCredentialRetrievers class avoid duplicate CredentialRetriever instances?\n", - "What does the `ReproducibleImageTest` test class verify?\n", - "How does the `createImage()` method create the test image?\n", - "What is the expected tarball structure and how is it verified in `testTarballStructure()`?\n", - "What package does the HelloWorld class belong to?\n", - "What license is this code released under?\n", - "What year was this code copyrighted?\n", - "What company owns the copyright to this code?\n", - "How does MavenSettingsServerCredentials infer credentials for a server?\n", - "What exceptions can be thrown when inferring credentials with MavenSettingsServerCredentials?\n", - "What is the format of the returned AuthProperty when inferring credentials?\n", - "How are the test settings files used in the tests?\n", - "How does the testPull() method verify the correctness of the pulled BLOB?\n", - "How is the RegistryClient instance created in the test methods?\n", - "How does JibBuildRunner handle a RegistryUnauthorizedException with a 403 Forbidden status code?\n", - "How does the buildToDockerDaemonAndRun method verify the built image?\n", - "How does the testExecute_dockerClient test work?\n", - "What exception is thrown when registry authentication fails?\n", - "What information is included in the exception message when a `RegistryAuthenticationFailedException` is thrown?\n", - "What is the default length of a generated pepper when no length is specified?\n", - "Is it possible to generate a pepper with a length of zero?\n", - "What is the expected value of the pepper returned by the PepperGenerator.get() method?\n", - "What is the purpose of the slowEquals method that takes two CharSequence objects?\n", - "How does the hash method handle the presence or absence of a salt value?\n", - "What are the input parameters for the BalloonHashingFunction constructor?\n", - "How can I obtain an instance of the BalloonHashingFunction using the factory method?\n", - "What are the test vectors used in the `TEST_VECTORS` array?\n", - "What is the maximum digest size supported by this Blake2b implementation?\n", - "How do you reset the hasher to its initial state?\n", - "What class does BadParametersException extend?\n", - "What package does the BadParametersException class belong to?\n", - "Who is the author of the BadParametersException class?\n", - "Since which version has the BadParametersException class been available?\n", - "What is the purpose of the Hash class?\n", - "What information does the Hash class store?\n", - "How do I create a HashBuilder instance?\n", - "How do I specify the hashing algorithm to use with the HashBuilder?\n", - "What is the difference between withPBKDF2() and withCompressedPBKDF2()?\n", - "How do you create an instance of MessageDigestFunction with a specific hashing algorithm and salt option?\n", - "How do you hash a password using MessageDigestFunction?\n", - "How do you check if a password matches a hashed value using MessageDigestFunction?\n", - "How can you retrieve the hashing algorithm and salt option used by a MessageDigestFunction instance?\n", - "What does the test method `issue92()` do?\n", - "What does the test method `issue99()` do?\n", - "What does the test method `issue93()` do?\n", - "What does the test method `issue120()` do?\n", - "Which package does the Argon2 enum belong to?\n", - "What is the purpose of the `Tag` class?\n", - "What is the purpose of the `pull` function in the `Tag` class?\n", - "How does the Serializer class serialize branch control instructions like br and br_if?\n", - "What is the purpose of the `Log` class?\n", - "What is the expected behavior of the `body` method of the `Log` class?\n", - "What is the purpose of the `procRaise` function in the `Environ` class?\n", - "How does the `procRaise` function handle unsupported signals?\n", - "What is the purpose of the `printPluginMock` function?\n", - "What is the purpose of the `body` method in the mock classes?\n", - "How do you access the magic bytes of a Module?\n", - "What sections are contained in a Module?\n", - "How do you check if a Module has been validated?\n", - "How can I retrieve the current log level using the provided classes?\n", - "How can I set an integer option value using the provided classes?\n", - "What is the purpose of the AVRescaleQ class?\n", - "What is the purpose of the WasmEdge::PO namespace?\n", - "What error codes are defined in the ErrCode enum?\n", - "What data does the Error class store?\n", - "How can I construct an Error object?\n", - "How can I access the error code and message of an Error object?\n", - "How can I set the ID of a specific chapter using the provided functions?\n", - "How can I set the time base of a specific chapter?\n", - "What does the AVChapterStart function do?\n", - "What is the purpose of the `importPk` function?\n", - "How does the `pkExportData` function work?\n", - "How do you create a new instance of the `Pty` struct?\n", - "How can you access the child process watcher associated with a `Pty` instance?\n", - "What does the `merge` function do?\n", - "How does the `merge_tables` function work?\n", - "What does the `merge_sequence` test verify?\n", - "How does the `get_pw_entry` function work?\n", - "What fields does the `Mouse` struct contain?\n", - "How are the default mouse bindings obtained in the `default` implementation of `MouseBindings`?\n", - "How is deserialization handled for the `MouseBindings` struct?\n", - "What is the purpose of the `MouseBinding` type?\n", - "What is the purpose of the `new_nop` method?\n", - "How does the Scheduler schedule a new event?\n", - "How can you cancel a scheduled event?\n", - "How can you check if a timer is already scheduled?\n", - "What does the `attach_handler` function do?\n", - "How do you create a new `Row` instance?\n", - "What does the `grow` method do in the `Row` struct?\n", - "How does the `shrink` method work in the `Row` struct?\n", - "What is the purpose of the `Debug` struct?\n", - "How can I specify the renderer preference in Alacritty?\n", - "What is the purpose of the Display struct?\n", - "How does the Display struct handle configuration updates?\n", - "What is the purpose of the FrameTimer struct?\n", - "What class does HasWeightCol extend?\n", - "What is the name and description of the weight column parameter?\n", - "How can I get the value of the weight column parameter?\n", - "What is the default value of the weight column parameter?\n", - "What parameters does the VectorSlicer class have?\n", - "How do I specify the indices to slice from the input vector?\n", - "What restrictions are there on the indices parameter?\n", - "How does an operator or UDF get access to the current epoch number?\n", - "Are the vector and l2Norm fields mutable in the VectorWithNorm class?\n", - "How do you set the parameters of the UnivariateFeatureSelector?\n", - "What happens if you don't set the feature_type or label_type parameters of the UnivariateFeatureSelector?\n", - "What is the output schema of the UnivariateFeatureSelectorModel after transforming data?\n", - "How does the testParam() method verify the parameter settings of LinearRegression?\n", - "What does the testInputTypeConversion() method verify?\n", - "What does the testRegularization() method check?\n", - "What is the maximum size of a segment that the MemorySegmentWriter will write?\n", - "What is the purpose of the HeadOperatorCoordinator class?\n", - "How does the HeadOperatorCoordinator determine when to send out a GloballyAlignedEvent?\n", - "How is the input data generated in this example?\n", - "How are the results extracted and displayed in this example?\n", - "What execution environment is used in this example?\n", - "How do you create an IndexToStringModel instance?\n", - "How do you set the model data for the IndexToStringModel?\n", - "How do you extract and display the results after applying the IndexToStringModel?\n", - "How do you create a new instance of the ColorEndPatternConverter?\n", - "What parameters does the ColorEndPatternConverter constructor take?\n", - "What namespaces are used in this file?\n", - "What does the XMLFilenameFilter constructor do?\n", - "What is the purpose of the pattern and replacement strings constructed in the XMLFilenameFilter constructor?\n", - "How would the XMLFilenameFilter be used?\n", - "What C++ standard library classes are used in this code?\n", - "How does the testTrigger unit test work?\n", - "How does the testValid unit test work?\n", - "How do you configure the logging system with a specific layout using BasicConfigurator?\n", - "How do you configure the logging system with a specific appender using BasicConfigurator?\n", - "How do you reset the logging configuration to its default state using BasicConfigurator?\n", - "What is the default layout used by BasicConfigurator if no layout is provided?\n", - "What is the purpose of the WideLife template class used in the code?\n", - "How does the hexdump function handle different character types for logging?\n", - "How do you create a DenyAllFilter instance?\n", - "What do you need to do after creating a DenyAllFilter instance?\n", - "What is the signature of the decide() method of the DenyAllFilter?\n", - "What namespaces are used in this file?\n", - "What is the purpose of the MinimumTestCase class?\n", - "What is the purpose of the common() method?\n", - "How do you obtain an instance of NDCPatternConverter?\n", - "How does the NDCPatternConverter format the logging event?\n", - "What is the inheritance hierarchy of NDCPatternConverter?\n", - "What macros are used in the NDCPatternConverter class declaration?\n", - "How does the test1() method test the FMTLayout?\n", - "What is the purpose of the common() method?\n", - "How do you construct a BufferedWriter object?\n", - "What methods does BufferedWriter override from its parent class?\n", - "How does BufferedWriter handle object destruction?\n", - "Pass@5: 0.821716589861751\n" - ] - } - ], + "outputs": [], "source": [ "def load_jsonl(file_path: str):\n", " \"\"\"Load JSONL file and return a list of dictionaries.\"\"\"\n", @@ -654,7 +404,6 @@ "for query_item in dataset:\n", "\n", " query = query_item[\"query\"]\n", - " print(query)\n", "\n", " golden_chunk_uuids = query_item[\"golden_chunk_uuids\"]\n", "\n", @@ -689,7 +438,23 @@ " query_score = chunks_found / len(golden_contents)\n", "\n", " total_query_score += query_score\n", - " num_queries += 1\n", + " num_queries += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pass@5: 0.7911386328725037\n" + ] + } + ], + "source": [ "print(\"Pass@5: \", total_query_score / num_queries)" ] }