From 36bd814d2c4690db8972bce74c1d9cbdd139dd58 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Mon, 30 Oct 2023 11:17:55 -0400 Subject: [PATCH 1/7] SOLR-16835: Introduce Python client generation Generation currently uses the default openapi-generator template for Python, and occurs in the 'api' module following OAS generation. --- solr/api/build.gradle | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/solr/api/build.gradle b/solr/api/build.gradle index 8e50d634485..852bf7b6921 100644 --- a/solr/api/build.gradle +++ b/solr/api/build.gradle @@ -17,6 +17,7 @@ plugins { id 'io.swagger.core.v3.swagger-gradle-plugin' version '2.2.2' + id "org.openapi.generator" version "6.0.1" } apply plugin: 'java-library' @@ -55,8 +56,24 @@ dependencies { testImplementation 'org.apache.lucene:lucene-test-framework' } +// Ensure the OAS is available to other modules who want to generate code (i.e. solrj) artifacts { openapiSpec resolve.outputDir, { builtBy resolve } } + +// Non-Java client generation tasks below: + +task buildPythonClient(type: org.openapitools.generator.gradle.plugin.tasks.GenerateTask) { + generatorName.set("python") + inputSpec.set("$openApiSpecFile") + outputDir.set("${buildDir}/generated/python") + packageName.set("solr") + generateApiTests.set(false) + generateModelTests.set(false) +} + +tasks.withType(org.openapitools.generator.gradle.plugin.tasks.GenerateTask) { + dependsOn(resolve) +} From 8c6179932f994dc45eed7d81da24786dda124ef1 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Wed, 29 Nov 2023 16:04:09 -0500 Subject: [PATCH 2/7] Make Python available to ltr example script --- solr/api/build.gradle | 9 ++++++++- solr/modules/ltr/build.gradle | 24 ++++++++++++++++++++++++ solr/modules/ltr/example/.gitignore | 1 + solr/modules/ltr/example/README.md | 12 +++++++++--- 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 solr/modules/ltr/example/.gitignore diff --git a/solr/api/build.gradle b/solr/api/build.gradle index 
6a05b8c46f4..ecd93125e86 100644 --- a/solr/api/build.gradle +++ b/solr/api/build.gradle @@ -40,6 +40,10 @@ configurations { canBeConsumed = true canBeResolved = false } + pythonClient { + canBeConsumed = true + canBeResolved = false + } } resolve { @@ -91,8 +95,11 @@ artifacts { builtBy resolve } - // Makes our Javascript client available to the Admin UI build + // Makes generated clients available to other build modules jsClient file(project.jsClientDir), { builtBy buildJSClient } + pythonClient file(project.pythonClientDir), { + builtBy buildPythonClient + } } diff --git a/solr/modules/ltr/build.gradle b/solr/modules/ltr/build.gradle index ba521258281..8faf27b6433 100644 --- a/solr/modules/ltr/build.gradle +++ b/solr/modules/ltr/build.gradle @@ -19,6 +19,18 @@ apply plugin: 'java-library' description = 'Learning to Rank Package' +ext { + asdf = layout.projectDirectory.dir("example").dir("solrclient") + pythonClientBundleDir = layout.projectDirectory.dir("example/solrclient") + +} + +configurations { + generatedPythonClient + // TODO Is 'bundle' really the right word here? 
+ generatedPythonClientBundle +} + dependencies { implementation project(':solr:core') implementation project(':solr:solrj') @@ -27,6 +39,12 @@ dependencies { implementation 'org.slf4j:slf4j-api' + // Used by example scripts + generatedPythonClient project(path: ":solr:api", configuration: "pythonClient") + generatedPythonClientBundle files(pythonClientBundleDir) { + builtBy "syncPythonClientSourceCode" + } + testImplementation('org.mockito:mockito-core', { exclude group: "net.bytebuddy", module: "byte-buddy-agent" }) @@ -42,3 +60,9 @@ dependencies { testImplementation 'commons-io:commons-io' } + +task syncPythonClientSourceCode(type: Sync) { + group = "Solr Python Client" + from configurations.generatedPythonClient + into project.pythonClientBundleDir +} diff --git a/solr/modules/ltr/example/.gitignore b/solr/modules/ltr/example/.gitignore new file mode 100644 index 00000000000..8990f048d28 --- /dev/null +++ b/solr/modules/ltr/example/.gitignore @@ -0,0 +1 @@ +solrclient/ diff --git a/solr/modules/ltr/example/README.md b/solr/modules/ltr/example/README.md index 7cd66484f2f..a1f3d7e1b46 100644 --- a/solr/modules/ltr/example/README.md +++ b/solr/modules/ltr/example/README.md @@ -31,9 +31,15 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ Alternatively, leave the `config.json` file unchanged and create a soft-link to your `liblinear` directory e.g. - `ln -s /Users/YourNameHere/Downloads/liblinear-2.1 ./modules/ltr/example/liblinear` + `ln -s /Users/YourNameHere/Downloads/liblinear-2.1 ./modules/ltr/example/liblinear`A -3. Extract features, train a reranking model, and deploy it to Solr. +3. Prepare your Python3 environment to run the training script. + + `./gradlew solr:modules:ltr:syncPythonClientSourceCode` + + This installs a Python client used to talk to Solr, making it and its dependencies available to the training script used by the step below. + +4. Extract features, train a reranking model, and deploy it to Solr. 
`cd modules/ltr/example` @@ -43,7 +49,7 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ document pairs of "userQueriesFile" and merges it with the features extracted from Solr into a training file. That file is used to train a linear model, which is then deployed to Solr for you to rerank results. -4. Search and rerank the results using the trained model +5. Search and rerank the results using the trained model ``` http://localhost:8983/solr/techproducts/query?q=test&rq={!ltr%20model=exampleModel%20reRankDocs=25%20efi.user_query=%27test%27}&fl=price,score,name From 5725e82388cd190f0337f2648c26924c8f0548b4 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Thu, 30 Nov 2023 13:09:03 -0500 Subject: [PATCH 3/7] Use generated client in ltr example script --- solr/modules/ltr/example/README.md | 4 +- .../example/train_and_upload_demo_model.py | 76 +++++++++---------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/solr/modules/ltr/example/README.md b/solr/modules/ltr/example/README.md index a1f3d7e1b46..edb417bdb65 100644 --- a/solr/modules/ltr/example/README.md +++ b/solr/modules/ltr/example/README.md @@ -37,7 +37,9 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ `./gradlew solr:modules:ltr:syncPythonClientSourceCode` - This installs a Python client used to talk to Solr, making it and its dependencies available to the training script used by the step below. + `pushd solrclient && python3 setup.py install --user && popd` + + This installs a Python client used to talk to Solr, making it and its dependencies available to the training script used below. 4. Extract features, train a reranking model, and deploy it to Solr. 
diff --git a/solr/modules/ltr/example/train_and_upload_demo_model.py b/solr/modules/ltr/example/train_and_upload_demo_model.py index 3258f82869a..a221abc4e7f 100755 --- a/solr/modules/ltr/example/train_and_upload_demo_model.py +++ b/solr/modules/ltr/example/train_and_upload_demo_model.py @@ -8,8 +8,15 @@ from optparse import OptionParser -solrQueryUrl = "" +import solr +from solr.api import querying_api +def setupSolrClient(host, port): + '''Configures the Solr client with the specified Solr host/port''' + solr_client_config = solr.Configuration( + host = "http://" + host + ":" + str(port) + "/api" + ) + solr.Configuration.set_default(solr_client_config) def setupSolr(collection, host, port, featuresFile, featureStoreName): '''Sets up solr with the proper features for the test''' @@ -46,48 +53,36 @@ def setupSolr(collection, host, port, featuresFile, featureStoreName): conn.close() -def generateQueries(userQueriesFile, collection, requestHandler, solrFeatureStoreName, efiParams): +def generateQueries(userQueriesFile, solrFeatureStoreName, efiParams): with open(userQueriesFile) as input: - solrQueryUrls = [] #A list of tuples with solrQueryUrl,solrQuery,docId,scoreForPQ,source + solrQueryInfo = [] #A list of tuples with solrQueryBody,queryText,docId,scoreForPQ,source for line in input: line = line.strip(); searchText,docId,score,source = line.split("|"); - solrQuery = generateHttpRequest(collection,requestHandler,solrFeatureStoreName,efiParams,searchText,docId) - solrQueryUrls.append((solrQuery,searchText,docId,score,source)) + solrQueryBody = generateQueryBody(solrFeatureStoreName,efiParams,searchText,docId) + solrQueryInfo.append((solrQueryBody,searchText,docId,score,source)) + return solrQueryInfo; - return solrQueryUrls; +def generateQueryBody(solrFeatureStoreName, efiParams, searchText, docId): + concreteEfiParams = efiParams.replace("$USERQUERY", searchText.strip()) + featuresTransformer = "[features store=" + solrFeatureStoreName + " " + concreteEfiParams + 
"]" + solrJsonParams = { + "query": 'id:"' + docId + '"', + "fields": ["id", "score", featuresTransformer] + } -def generateHttpRequest(collection, requestHandler, solrFeatureStoreName, efiParams, searchText, docId): - global solrQueryUrl - if len(solrQueryUrl) < 1: - solrQueryUrl = "/".join([ "", "solr", collection, requestHandler ]) - solrQueryUrl += ("?fl=" + ",".join([ "id", "score", "[features store="+solrFeatureStoreName+" "+efiParams+"]" ])) - solrQueryUrl += "&q=" - solrQueryUrl = solrQueryUrl.replace(" ","+") - solrQueryUrl += urllib.parse.quote_plus("id:") + return solrJsonParams - userQuery = urllib.parse.quote_plus(searchText.strip().replace("'","\\'").replace("/","\\\\/")) - solrQuery = solrQueryUrl + '"' + urllib.parse.quote_plus(docId) + '"' #+ solrQueryUrlEnd - solrQuery = solrQuery.replace("%24USERQUERY", userQuery).replace('$USERQUERY', urllib.parse.quote_plus("\\'" + userQuery + "\\'")) - - return solrQuery - - -def generateTrainingData(solrQueries, host, port): +def generateTrainingData(solrQueries, host, port, coreName): '''Given a list of solr queries, yields a tuple of query , docId , score , source , feature vector for each query. 
Feature Vector is a list of strings of form "key=value"''' - conn = http.client.HTTPConnection(host, port) - headers = {"Connection":" keep-alive"} try: - for queryUrl,query,docId,score,source in solrQueries: - conn.request("GET", queryUrl, headers=headers) - r = conn.getresponse() - msg = r.read() - msgDict = json.loads(msg) + queryClient = querying_api.QueryingApi() + for solrQueryBody,query,docId,score,source in solrQueries: + msgDict = queryClient.json_query("cores", coreName, solrQueryBody) fv = "" docs = msgDict['response']['docs'] if len(docs) > 0 and "[features]" in docs[0]: @@ -101,19 +96,17 @@ def generateTrainingData(solrQueries, host, port): print("ERROR FOR: " + docId); - print(msg) + print(msgDict) continue; - - if r.status == http.client.OK: - #print "http connection was ok for: " + queryUrl - yield(query,docId,score,source,fv.split(",")); - else: - raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) + if msgDict.get("response_header") != None: + status = msgDict.get("response_header").get("status") + if status == 0: + #print "http connection was ok for: " + queryUrl + yield(query,docId,score,source,fv.split(",")); + else: + raise Exception("Status: {0} \nResponse: {1}".format(status, msgDict)) except Exception as e: - print(msg) + print("ERROR: Solr feature-extraction query failed") print(e) - conn.close() - - def uploadModel(collection, host, port, modelFile, modelName): modelUrl = "/solr/" + collection + "/schema/model-store" headers = {'Content-type': 'application/json'} @@ -156,13 +149,14 @@ def main(argv=None): config = json.load(configFile) print("Uploading features ("+config["solrFeaturesFile"]+") to Solr") + setupSolrClient(config["host"], config["port"]) setupSolr(config["collection"], config["host"], config["port"], config["solrFeaturesFile"], config["solrFeatureStoreName"]) print("Converting user queries ("+config["userQueriesFile"]+") into Solr queries for feature extraction") - reRankQueries = generateQueries(config["userQueriesFile"], config["collection"], 
config["requestHandler"], config["solrFeatureStoreName"], config["efiParams"]) + reRankQueries = generateQueries(config["userQueriesFile"], config["solrFeatureStoreName"], config["efiParams"]) print("Running Solr queries to extract features") - fvGenerator = generateTrainingData(reRankQueries, config["host"], config["port"]) + fvGenerator = generateTrainingData(reRankQueries, config["host"], config["port"], config["collection"]) formatter = libsvm_formatter.LibSvmFormatter(); formatter.processQueryDocFeatureVector(fvGenerator,config["trainingFile"]); From 2270b9d2c6b63f52fbff72a9ffec31d444b47b94 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Thu, 30 Nov 2023 13:18:58 -0500 Subject: [PATCH 4/7] Gradle cleanup --- solr/modules/ltr/build.gradle | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/solr/modules/ltr/build.gradle b/solr/modules/ltr/build.gradle index 8faf27b6433..022cf99778b 100644 --- a/solr/modules/ltr/build.gradle +++ b/solr/modules/ltr/build.gradle @@ -20,15 +20,12 @@ apply plugin: 'java-library' description = 'Learning to Rank Package' ext { - asdf = layout.projectDirectory.dir("example").dir("solrclient") - pythonClientBundleDir = layout.projectDirectory.dir("example/solrclient") - + pythonClientCopyDir = layout.projectDirectory.dir("example/solrclient") } configurations { generatedPythonClient - // TODO Is 'bundle' really the right word here? 
- generatedPythonClientBundle + localPythonClientCopy } dependencies { @@ -41,8 +38,8 @@ dependencies { // Used by example scripts generatedPythonClient project(path: ":solr:api", configuration: "pythonClient") - generatedPythonClientBundle files(pythonClientBundleDir) { - builtBy "syncPythonClientSourceCode" + localPythonClientCopy files(pythonClientCopyDir) { + builtBy "copyPythonClientToExample" } testImplementation('org.mockito:mockito-core', { @@ -61,8 +58,8 @@ dependencies { testImplementation 'commons-io:commons-io' } -task syncPythonClientSourceCode(type: Sync) { +task copyPythonClientToExample(type: Sync) { group = "Solr Python Client" from configurations.generatedPythonClient - into project.pythonClientBundleDir + into project.pythonClientCopyDir } From 8d91d216d4dbd1c461af97f4506657356f2058b0 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Thu, 30 Nov 2023 13:22:22 -0500 Subject: [PATCH 5/7] Remove unused arguments --- solr/modules/ltr/example/train_and_upload_demo_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/modules/ltr/example/train_and_upload_demo_model.py b/solr/modules/ltr/example/train_and_upload_demo_model.py index a221abc4e7f..31d7532dd94 100755 --- a/solr/modules/ltr/example/train_and_upload_demo_model.py +++ b/solr/modules/ltr/example/train_and_upload_demo_model.py @@ -76,7 +76,7 @@ def generateQueryBody(solrFeatureStoreName, efiParams, searchText, docId): return solrJsonParams -def generateTrainingData(solrQueries, host, port, coreName): +def generateTrainingData(solrQueries, coreName): '''Given a list of solr queries, yields a tuple of query , docId , score , source , feature vector for each query. 
Feature Vector is a list of strings of form "key=value"''' try: @@ -156,7 +156,7 @@ def main(argv=None): reRankQueries = generateQueries(config["userQueriesFile"], config["solrFeatureStoreName"], config["efiParams"]) print("Running Solr queries to extract features") - fvGenerator = generateTrainingData(reRankQueries, config["host"], config["port"], config["collection"]) + fvGenerator = generateTrainingData(reRankQueries, config["collection"]) formatter = libsvm_formatter.LibSvmFormatter(); formatter.processQueryDocFeatureVector(fvGenerator,config["trainingFile"]); From b5e47be72ee85647c7f8ffedcf903bb20cccf0e8 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Thu, 30 Nov 2023 13:25:02 -0500 Subject: [PATCH 6/7] Resolve typo --- solr/modules/ltr/example/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/modules/ltr/example/README.md b/solr/modules/ltr/example/README.md index edb417bdb65..146f5bf6d48 100644 --- a/solr/modules/ltr/example/README.md +++ b/solr/modules/ltr/example/README.md @@ -31,7 +31,7 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ Alternatively, leave the `config.json` file unchanged and create a soft-link to your `liblinear` directory e.g. - `ln -s /Users/YourNameHere/Downloads/liblinear-2.1 ./modules/ltr/example/liblinear`A + `ln -s /Users/YourNameHere/Downloads/liblinear-2.1 ./modules/ltr/example/liblinear` 3. Prepare your Python3 environment to run the training script. 
From 222c5bd46a9dade93245d5b76e5763ccd31db543 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Tue, 5 Dec 2023 10:21:35 -0500 Subject: [PATCH 7/7] Minor example-README tweak --- solr/modules/ltr/example/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/modules/ltr/example/README.md b/solr/modules/ltr/example/README.md index 146f5bf6d48..f4220ca4982 100644 --- a/solr/modules/ltr/example/README.md +++ b/solr/modules/ltr/example/README.md @@ -37,7 +37,7 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ - `./gradlew solr:modules:ltr:syncPythonClientSourceCode` + `./gradlew solr:modules:ltr:copyPythonClientToExample` - `pushd solrclient && python3 setup.py install --user && popd` + `cd solr/modules/ltr/example/solrclient && python3 setup.py install --user && cd -` This installs a Python client used to talk to Solr, making it and its dependencies available to the training script used below.