diff --git a/solr/api/build.gradle b/solr/api/build.gradle index b83b61683d5..ecd93125e86 100644 --- a/solr/api/build.gradle +++ b/solr/api/build.gradle @@ -26,6 +26,7 @@ description = 'API - Interfaces and classes used to represent Solrs APIs' ext { jsClientDir = "${buildDir}/generated/js" + pythonClientDir = "${buildDir}/generated/python" openApiSpecDir = "${buildDir}/generated/openapi" openApiSpecFile = "${project.openApiSpecDir}/openapi.json" } @@ -39,6 +40,10 @@ configurations { canBeConsumed = true canBeResolved = false } + pythonClient { + canBeConsumed = true + canBeResolved = false + } } resolve { @@ -62,7 +67,6 @@ dependencies { } // Non-Java client generation tasks below: - task buildJSClient(type: org.openapitools.generator.gradle.plugin.tasks.GenerateTask) { generatorName.set("javascript") inputSpec.set("$openApiSpecFile") @@ -72,6 +76,15 @@ task buildJSClient(type: org.openapitools.generator.gradle.plugin.tasks.Generate generateModelTests.set(false) } +task buildPythonClient(type: org.openapitools.generator.gradle.plugin.tasks.GenerateTask) { + generatorName.set("python") + inputSpec.set("$openApiSpecFile") + outputDir.set("$pythonClientDir") + packageName.set("solr") + generateApiTests.set(false) + generateModelTests.set(false) +} + tasks.withType(org.openapitools.generator.gradle.plugin.tasks.GenerateTask) { dependsOn(resolve) } @@ -82,8 +95,11 @@ artifacts { builtBy resolve } - // Makes our Javascript client available to the Admin UI build + // Makes generated clients available to other build modules jsClient file(project.jsClientDir), { builtBy buildJSClient } + pythonClient file(project.pythonClientDir), { + builtBy buildPythonClient + } } diff --git a/solr/modules/ltr/build.gradle b/solr/modules/ltr/build.gradle index ba521258281..022cf99778b 100644 --- a/solr/modules/ltr/build.gradle +++ b/solr/modules/ltr/build.gradle @@ -19,6 +19,15 @@ apply plugin: 'java-library' description = 'Learning to Rank Package' +ext { + pythonClientCopyDir = 
layout.projectDirectory.dir("example/solrclient") +} + +configurations { + generatedPythonClient + localPythonClientCopy +} + dependencies { implementation project(':solr:core') implementation project(':solr:solrj') @@ -27,6 +36,12 @@ dependencies { implementation 'org.slf4j:slf4j-api' + // Used by example scripts + generatedPythonClient project(path: ":solr:api", configuration: "pythonClient") + localPythonClientCopy files(pythonClientCopyDir) { + builtBy "copyPythonClientToExample" + } + testImplementation('org.mockito:mockito-core', { exclude group: "net.bytebuddy", module: "byte-buddy-agent" }) @@ -42,3 +57,9 @@ dependencies { testImplementation 'commons-io:commons-io' } + +task copyPythonClientToExample(type: Sync) { + group = "Solr Python Client" + from configurations.generatedPythonClient + into project.pythonClientCopyDir +} diff --git a/solr/modules/ltr/example/.gitignore b/solr/modules/ltr/example/.gitignore new file mode 100644 index 00000000000..8990f048d28 --- /dev/null +++ b/solr/modules/ltr/example/.gitignore @@ -0,0 +1 @@ +solrclient/ diff --git a/solr/modules/ltr/example/README.md b/solr/modules/ltr/example/README.md index 7cd66484f2f..f4220ca4982 100644 --- a/solr/modules/ltr/example/README.md +++ b/solr/modules/ltr/example/README.md @@ -33,7 +33,15 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ `ln -s /Users/YourNameHere/Downloads/liblinear-2.1 ./modules/ltr/example/liblinear` -3. Extract features, train a reranking model, and deploy it to Solr. +3. Prepare your Python3 environment to run the training script. + + `./gradlew solr:modules:ltr:copyPythonClientToExample` + + `cd solr/modules/ltr/example/solrclient && python3 setup.py install --user && cd -` + + This installs a Python client used to talk to Solr, making it and its dependencies available to the training script used below. + +4. Extract features, train a reranking model, and deploy it to Solr. 
`cd modules/ltr/example` @@ -43,7 +51,7 @@ Please refer to the Solr Reference Guide's section on [Learning To Rank](https:/ document pairs of "userQueriesFile" and merges it with the features extracted from Solr into a training file. That file is used to train a linear model, which is then deployed to Solr for you to rerank results. -4. Search and rerank the results using the trained model +5. Search and rerank the results using the trained model ``` http://localhost:8983/solr/techproducts/query?q=test&rq={!ltr%20model=exampleModel%20reRankDocs=25%20efi.user_query=%27test%27}&fl=price,score,name diff --git a/solr/modules/ltr/example/train_and_upload_demo_model.py b/solr/modules/ltr/example/train_and_upload_demo_model.py index 3258f82869a..31d7532dd94 100755 --- a/solr/modules/ltr/example/train_and_upload_demo_model.py +++ b/solr/modules/ltr/example/train_and_upload_demo_model.py @@ -8,8 +8,15 @@ from optparse import OptionParser -solrQueryUrl = "" +import solr +from solr.api import querying_api +def setupSolrClient(host, port): + '''Configures the Solr client with the specified Solr host/port''' + solr_client_config = solr.Configuration( + host = "http://" + host + ":" + str(port) + "/api" + ) + solr.Configuration.set_default(solr_client_config) def setupSolr(collection, host, port, featuresFile, featureStoreName): '''Sets up solr with the proper features for the test''' @@ -46,48 +53,36 @@ def setupSolr(collection, host, port, featuresFile, featureStoreName): conn.close() -def generateQueries(userQueriesFile, collection, requestHandler, solrFeatureStoreName, efiParams): +def generateQueries(userQueriesFile, solrFeatureStoreName, efiParams): with open(userQueriesFile) as input: - solrQueryUrls = [] #A list of tuples with solrQueryUrl,solrQuery,docId,scoreForPQ,source + solrQueryInfo = [] #A list of tuples with solrQueryBody,queryText,docId,scoreForPQ,source for line in input: line = line.strip(); searchText,docId,score,source = line.split("|"); - solrQuery = 
generateHttpRequest(collection,requestHandler,solrFeatureStoreName,efiParams,searchText,docId) - solrQueryUrls.append((solrQuery,searchText,docId,score,source)) + solrQueryBody = generateQueryBody(solrFeatureStoreName,efiParams,searchText,docId) + solrQueryInfo.append((solrQueryBody,searchText,docId,score,source)) + return solrQueryInfo; - return solrQueryUrls; +def generateQueryBody(solrFeatureStoreName, efiParams, searchText, docId): + concreteEfiParams = efiParams.replace("$USERQUERY", searchText.strip()) + featuresTransformer = "[features store=" + solrFeatureStoreName + " " + concreteEfiParams + "]" + solrJsonParams = { + "query": "id:" + docId, + "fields": ["id", "score", featuresTransformer] + } -def generateHttpRequest(collection, requestHandler, solrFeatureStoreName, efiParams, searchText, docId): - global solrQueryUrl - if len(solrQueryUrl) < 1: - solrQueryUrl = "/".join([ "", "solr", collection, requestHandler ]) - solrQueryUrl += ("?fl=" + ",".join([ "id", "score", "[features store="+solrFeatureStoreName+" "+efiParams+"]" ])) - solrQueryUrl += "&q=" - solrQueryUrl = solrQueryUrl.replace(" ","+") - solrQueryUrl += urllib.parse.quote_plus("id:") + return solrJsonParams - userQuery = urllib.parse.quote_plus(searchText.strip().replace("'","\\'").replace("/","\\\\/")) - solrQuery = solrQueryUrl + '"' + urllib.parse.quote_plus(docId) + '"' #+ solrQueryUrlEnd - solrQuery = solrQuery.replace("%24USERQUERY", userQuery).replace('$USERQUERY', urllib.parse.quote_plus("\\'" + userQuery + "\\'")) - - return solrQuery - - -def generateTrainingData(solrQueries, host, port): +def generateTrainingData(solrQueries, coreName): '''Given a list of solr queries, yields a tuple of query , docId , score , source , feature vector for each query. 
Feature Vector is a list of strings of form "key=value"''' - conn = http.client.HTTPConnection(host, port) - headers = {"Connection":" keep-alive"} - try: - for queryUrl,query,docId,score,source in solrQueries: - conn.request("GET", queryUrl, headers=headers) - r = conn.getresponse() - msg = r.read() - msgDict = json.loads(msg) + queryClient = querying_api.QueryingApi() + for solrQueryBody,query,docId,score,source in solrQueries: + msgDict = queryClient.json_query("cores", coreName, solrQueryBody) fv = "" docs = msgDict['response']['docs'] if len(docs) > 0 and "[features]" in docs[0]: @@ -101,19 +96,17 @@ def generateTrainingData(solrQueries, host, port): print("ERROR FOR: " + docId); print(msg) continue; - - if r.status == http.client.OK: - #print "http connection was ok for: " + queryUrl - yield(query,docId,score,source,fv.split(",")); - else: - raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) + if msgDict.get("response_header") != None: + status = msgDict.get("response_header").get("status") + if status == 0: + #print "http connection was ok for: " + queryUrl + yield(query,docId,score,source,fv.split(",")); + else: + raise Exception("Status: {0} \nResponse: {1}".format(status, msgDict)) except Exception as e: print(msg) print(e) - conn.close() - - def uploadModel(collection, host, port, modelFile, modelName): modelUrl = "/solr/" + collection + "/schema/model-store" headers = {'Content-type': 'application/json'} @@ -156,13 +149,14 @@ def main(argv=None): config = json.load(configFile) print("Uploading features ("+config["solrFeaturesFile"]+") to Solr") + setupSolrClient(config["host"], config["port"]) setupSolr(config["collection"], config["host"], config["port"], config["solrFeaturesFile"], config["solrFeatureStoreName"]) print("Converting user queries ("+config["userQueriesFile"]+") into Solr queries for feature extraction") - reRankQueries = generateQueries(config["userQueriesFile"], config["collection"], 
config["requestHandler"], config["solrFeatureStoreName"], config["efiParams"]) + reRankQueries = generateQueries(config["userQueriesFile"], config["solrFeatureStoreName"], config["efiParams"]) print("Running Solr queries to extract features") - fvGenerator = generateTrainingData(reRankQueries, config["host"], config["port"]) + fvGenerator = generateTrainingData(reRankQueries, config["collection"]) formatter = libsvm_formatter.LibSvmFormatter(); formatter.processQueryDocFeatureVector(fvGenerator,config["trainingFile"]);