diff --git a/flows/chainlit-ui-evaluation/Dockerfile b/flows/chainlit-ui-evaluation/Dockerfile index 12b8e9fd..d12472d2 100644 --- a/flows/chainlit-ui-evaluation/Dockerfile +++ b/flows/chainlit-ui-evaluation/Dockerfile @@ -42,7 +42,7 @@ RUN pip3 install keyrings.alt # Set up Connections RUN pf connection create --file ./openai.yaml --set api_key=$OPENAI_API_KEY --name open_ai_connection -RUN pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_openai +RUN pf connection create --file ./azure_openai.yaml --set api_key=$OPENAI_API_KEY --set api_base=$OPENAI_API_ENDPOINT --name azure_open_ai_connection RUN echo "DEBUG DOCKER" RUN which python diff --git a/flows/chainlit-ui-evaluation/azure_openai.yaml b/flows/chainlit-ui-evaluation/azure_openai.yaml index 5b916e77..ef5bed6c 100644 --- a/flows/chainlit-ui-evaluation/azure_openai.yaml +++ b/flows/chainlit-ui-evaluation/azure_openai.yaml @@ -1,5 +1,5 @@ $schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureOpenAIConnection.schema.json -name: open_ai_connection +name: azure_open_ai_connection type: azure_open_ai api_key: "" api_base: "" diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml index f9d9d237..6552cb2f 100644 --- a/flows/chainlit-ui-evaluation/flow.dag.yaml +++ b/flows/chainlit-ui-evaluation/flow.dag.yaml @@ -55,7 +55,7 @@ nodes: context: ${inputs.context} temperature: 1 model: gpt-4-turbo-preview - connection: open_ai_connection + connection: azure_open_ai_connection api: chat - name: concat_scores type: python diff --git a/templates/groundedness_score.jinja2 b/templates/groundedness_score.jinja2 index e69de29b..857f5eda 100644 --- a/templates/groundedness_score.jinja2 +++ b/templates/groundedness_score.jinja2 @@ -0,0 +1,41 @@ +System: +You are an AI assistant. 
You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +User: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following ratings: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. + +Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. + +Note that the ANSWER is generated by a computer system; it can contain certain symbols, which should not be a negative factor in the evaluation. +Independent Examples: +## Example Task #1 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is presented every other two years"} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. 
They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is very important awards in the entertainment industry in the United States. And it's also significant worldwide"} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 +## Example Task #4 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #4 Output: +1 + +Key points: + +- if the CONTEXT is a basic greeting, like "Hello" and the ANSWER is a basic greeting also, groundedness is 5 +- Focus on facts, not on the language used to present the facts +- If the ANSWER presents a fact, and the CONTEXT presents the same fact, groundedness is 5 + +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. + +## Actual Task Input: +{"CONTEXT": {{context}}, "ANSWER": {{answer}}} + +Actual Task Output: \ No newline at end of file