-
Notifications
You must be signed in to change notification settings - Fork 1
/
likert.py
122 lines (107 loc) · 4.29 KB
/
likert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import json
from time import sleep
from tqdm import tqdm
import openai
import os
from templates import sentiment_likert_prompt, helpfulness_likert_prompt
MAX_API_RETRY = 5
REQ_TIME_GAP = 5
openai.api_key = os.getenv("OPENAI_API_KEY")
def get_eval(user_prompt: str, function_name: str):
sys_prompt = "You are a helpful and precise assistant for checking the quality of the answer."
functions = [
{
"name": "sentiment_likert",
"description": "Extracts the sentiment score of the response, plus a reasoning for that score.",
"parameters": {
"type": "object",
"properties": {
"sentiment": {
"type": "integer",
"description": "The score of the sentiment, from 1 to 5.",
},
"reasoning": {
"type": "string",
"description": "The reasoning for the score.",
},
},
"required": ["sentiment", "reasoning"],
},
},
{
"name": "helpfulness_likert",
"description": "Extracts the helpfulness score of the response, plus a reasoning for that score.",
"parameters": {
"type": "object",
"properties": {
"score": {
"type": "integer",
"description": "The score of the helpfulness, from 1 to 6.",
},
"reasoning": {
"type": "string",
"description": "The reasoning for the score.",
},
},
"required": ["score", "reasoning"],
},
},
]
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo-0613",
messages=[
{"role": "system", "content": sys_prompt},
{"role": "user", "content": user_prompt},
],
temperature=0.0,
max_tokens=512,
function_call={"name": function_name},
functions=functions,
)
return json.loads(
response["choices"][0]["message"]["function_call"]["arguments"]
)
except Exception as e:
print(e)
sleep(REQ_TIME_GAP)
raise Exception(f"Failed after {MAX_API_RETRY} retries.")
def likert_score(prompts, outputs, prompt_template, function_name):
if isinstance(prompts, str) and isinstance(outputs, str):
prompts = [prompts]
outputs = [outputs]
assert len(prompts) == len(outputs), "Number of prompts and outputs must be equal"
data = []
for prompt, output in tqdm(zip(prompts, outputs)):
critique_dict = get_eval(
prompt_template.format(task=prompt, submission=output),
function_name=function_name,
)
data.append(critique_dict)
return data
if __name__ == "__main__":
print("Sentiment Likert")
scores = likert_score(
prompts=["Evaluate the sentiment of this movie review"] * 3,
outputs=[
"The film was simply amazing. The acting was great, the plot was interesting, and the cinematography was beautiful. I would recommend this movie to anyone who enjoys a good drama.",
"I'm not sure if I liked this movie. It was a bit too long and the plot was confusing. The acting was good, but the cinematography was a bit too dark. I would recommend this movie to anyone who enjoys a good drama.",
"What a waste of time. The acting was terrible, the plot was boring, and the cinematography was awful. I would not recommend this movie to anyone.",
],
prompt_template=sentiment_likert_prompt,
function_name="sentiment_likert",
)
print(scores)
print("Helpfulness Likert")
scores = likert_score(
prompts=["What is the capital of France?"] * 3,
outputs=[
"Paris is the capital of France",
"Paris, but I'm not sure",
"Madrid",
],
prompt_template=helpfulness_likert_prompt,
function_name="helpfulness_likert",
)
print(scores)