ap.py
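"""Persona-matching evaluation on the ap synthetic dataset.

Builds a natural-language profile for each synthetic persona, embeds it with
Sentence-BERT, pairs each persona's response with the response of its most
similar persona, and asks an LLM to pick which question-answer pair the user
wrote, along with a 1-100 certainty score. Accuracy is then aggregated per
certainty bucket to inspect calibration.
"""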
import json
import numpy as np
import random
import os
import torch
import pandas as pd
import argparse
from utils import dump_jsonl, extract_number, get_api_response
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

data = []
encodings = []
count = 0


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
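
# Example: cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))
# returns 1 / sqrt(2) ≈ 0.7071; this assumes both vectors are non-zero, which
# holds for Sentence-BERT embeddings in practice.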


def get_args_parser():
    parser = argparse.ArgumentParser('opinionqa', add_help=False)
    parser.add_argument("--log_path", default="ap.log", type=str, help="Path to save the log")
    parser.add_argument("--num_sample", default=600, type=int)
    parser.add_argument("--prompt_type", choices=["with_persona", "no_persona", "no_confidence"], default="with_persona", type=str)
    parser.add_argument("--persona_features", choices=["all_features", "least_imp_feature", "key_features"], default="all_features", type=str)
    parser.add_argument("--model", choices=["gpt-4", "gpt-4o", "gpt-3.5-turbo", "claude-3-sonnet-20240229", "command-r-plus", "meta-llama/Meta-Llama-3-70B-Instruct"], default="command-r-plus", type=str)
    parser.add_argument("--output_dir", default="outputs", type=str)
    return parser
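
# Example invocation (defaults shown; all flags are defined in get_args_parser above):
#   python ap.py --model command-r-plus --prompt_type with_persona \
#       --persona_features all_features --num_sample 600 --output_dir outputs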


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    jsonl_path = os.path.join(args.output_dir, f"ap_{args.prompt_type}_{args.persona_features}_{args.model}.jsonl".replace("/", "_"))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    data_path = "./ap/synthetic_dataset.jsonl"

    print("Loading model")
    # sent_bert_tok = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    sent_bert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to("cuda")
    print("Model loaded")
    # Refuse to append to an existing results file.
    assert not os.path.exists(jsonl_path), f"{jsonl_path} already exists"

    if "llama" in args.model:
        tok = AutoTokenizer.from_pretrained(args.model)
        model = AutoModelForCausalLM.from_pretrained(args.model, device_map="auto", torch_dtype=torch.float16)
    else:
        tok = None
        model = args.model
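
    # Note: get_api_response (imported from utils) is assumed to accept either a
    # loaded HF model (the Llama branch above) or a hosted-API model name string
    # via its `model` argument, and to dispatch accordingly.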

    with open(data_path, 'r') as f:
        tmp = f.readlines()
    for idx, dp in enumerate(tmp):
        persona = []
        dp = json.loads(dp)
        persona.append(f"The person is {dp['personality']['age']} years old")
        persona.append(f"The person is {dp['personality']['sex']}")
        persona.append(f"The person is living in {dp['personality']['city_country']}")
        persona.append(f"The person was born in {dp['personality']['birth_city_country']}")
        persona.append(f"The person's highest education level is {dp['personality']['education']}")
        persona.append(f"The person's occupation is {dp['personality']['occupation']}")
        persona.append(f"The person's income is {dp['personality']['income']}")
        persona.append(f"The person is {dp['personality']['relationship_status']}")
        persona = ". ".join(persona) + "."
        dp['desc'] = persona
        encoding = sent_bert_model.encode([persona])[0]
        # Store the embedding on the record and in the global list; the
        # similarity search below reads dp['encoding'] and iterates `encodings`.
        dp['encoding'] = encoding
        encodings.append(encoding)
        data.append(dp)
        print(idx)
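
    # Illustrative profile string (values invented; fields match the code above):
    # "The person is 29 years old. The person is female. The person is living in
    # Toronto, Canada. ... The person is single."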

    for i, dp in enumerate(data):
        similar_persona_idx = ""
        similarity = 0
        ### Find the most similar persona
        for idx, key in enumerate(encodings):
            if idx == i:
                # Skip the persona's own embedding; otherwise it would always
                # match itself with similarity 1.0.
                continue
            sim = cosine_similarity(dp['encoding'], key)
            if sim > similarity:
                similarity = sim
                similar_persona_idx = idx
                similar_persona = data[idx]['desc']
                similar_response = data[idx]['response']
                similar_question = data[idx]['question_asked']
        if random.random() > 0.5:
            response_A = dp['response']
            response_B = similar_response
            question_A = dp['question_asked']
            question_B = similar_question
            gt = "A"
        else:
            response_A = similar_response
            response_B = dp['response']
            question_A = similar_question
            question_B = dp['question_asked']
            gt = "B"
prompt = """Given the user profile provided below, select the question-answer pair of which the answer is most likely written by the user. Declare your choice by using the format: "[[A]]" if you believe Answer A is more suitable, or "[[B]]" if Answer B is better suited. Additionally, assess your confidence in this decision by assigning a certainty level from 1 to 100. Use the following guidelines to assign the certainty level:
1--20 (Uncertain): The user profile provides insufficient or Minimal evidence information suggests a preference. The decision is largely based on weak or indirect hints.
21--40 (Moderately Confident): There is noticeable evidence supporting a preference, though it is not comprehensive, and other interpretations are possible.
41--60 (Quite Confident): You find clear and convincing evidence that supports your prediction, though it is not entirely decisive.
61--80 (Confident): The user profile contains strong evidence that clearly supports your prediction, with very little ambiguity.
81--100 (Highly Confident): The user profile provides direct and explicit evidence that decisively supports your prediction.
Ensure you enclose your chosen certainty level in double brackets, like so: [[X]].
[User Profile]
{user_info}
[Question-Answer Pair A]
Question: {question_A}
Answer: {response_A}
[Question-Answer Pair B]
Question: {question_B}
Answer: {response_B}
[Answer]
[[""".format(user_info=dp['desc'], question_A=question_A, response_A=response_A, question_B=question_B, response_B=response_B)
# prompt = """For the questiona and response below, select the user profile that most likely respond to the question in that way. Declare your choice by using the format: "[[A]]" if you believe User A is more suitable, or "[[B]]" if User B is better suited. Additionally, assess your confidence in this decision by assigning a certainty level from 1 to 100. Use the following guidelines to assign the certainty level:
# 1--20 (Uncertain): The user profile provides insufficient or Minimal evidence information suggests a preference. The decision is largely based on weak or indirect hints.
# 21--40 (Moderately Confident): There is noticeable evidence supporting a preference, though it is not comprehensive, and other interpretations are possible.
# 41--60 (Quite Confident): You find clear and convincing evidence that supports your prediction, though it is not entirely decisive.
# 61--80 (Confident): The user profile contains strong evidence that clearly supports your prediction, with very little ambiguity.
# 81--100 (Highly Confident): The user profile provides direct and explicit evidence that decisively supports your prediction.
# Ensure you enclose your chosen certainty level in double brackets, like so: [[X]].
# [Question]
# {question}
# [User Response]
# {response}
# [User A's Profile]
# {user_info_a}
# [User B's Profile]
# {user_info_b}
# [Answer]
# [[""".format(question=dp['question_asked'], response=dp['response'], user_info_a=similar_persona, user_info_b=dp['desc'])
# res = get_api_response(prompt, model=model, tokenizer=tok, max_tokens=15)
# import pdb; pdb.set_trace()
# ans = res.replace("(", "").replace("[[", "").replace("Answer:", "").strip()[0]

        def parse_res(res):
            # Strip decoration so the first remaining character is the choice.
            ans = res.replace("(", "").replace("[[", "").replace("Answer:", "").strip()[0]
            if ans != "A" and ans != "B":
                return False, False
            try:
                certainty = extract_number(res)
                return ans, certainty
            except Exception:
                return False, False
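
        # e.g. parse_res("A]] Certainty: [[85]]") -> ("A", 85), assuming
        # extract_number (from utils) pulls the numeric certainty out of the
        # text; any reply not starting with A or B yields (False, False).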

        def get_api_response_with_parse(prompt, model, tok=None, max_tokens=15, temperature=0.7, stop_strs=None, max_depth=3, cur_depth=0):
            # Query the model and retry (up to max_depth times) when the reply
            # cannot be parsed into a choice and a certainty.
            res = get_api_response(prompt, model=model, tokenizer=tok, max_tokens=max_tokens)
            ans, certainty = parse_res(res)
            if ans is False and cur_depth < max_depth:
                print("regenerating")
                return get_api_response_with_parse(prompt, model=model, tok=tok, max_tokens=max_tokens, temperature=temperature, stop_strs=stop_strs, max_depth=max_depth, cur_depth=cur_depth + 1)
            if cur_depth > 0 and cur_depth < max_depth:
                print("regeneration succeeded")
            elif cur_depth == max_depth:
                print("regeneration failed")
            return ans, certainty, res

        ans, certainty, res = get_api_response_with_parse(prompt, model=model, tok=tok, max_tokens=15)
        acc = (ans == gt)
        print("res: ", res)
        print("ans: ", ans, gt)
        # `record` avoids shadowing the builtin `dict`; user_info uses dp['desc']
        # (the current persona) rather than the leftover `persona` loop variable.
        record = {"prompt": prompt, "answer": ans, "certainty": certainty, 'acc': acc, "ground_truth": gt, "user_info": dp['desc'], "res": res}
        dump_jsonl(record, jsonl_path)
        count += 1
        if count > args.num_sample:
            break

    df = pd.read_json(jsonl_path, lines=True)
    df = df[(df['answer'] == "A") | (df['answer'] == "B")]
    # If certainty was reported on the 1-100 scale, bucket it into deciles
    # (.iloc avoids a KeyError when the row labeled 0 was filtered out above).
    if df['certainty'].iloc[0] > 50:
        df['certainty'] = df['certainty'].apply(lambda x: int(x // 10))
    grouped = df.groupby('certainty')['acc'].agg(Total_Responses='count', Correct_Responses=lambda x: x.sum(), Accuracy='mean')
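    # `grouped` is indexed by certainty bucket with columns Total_Responses,
    # Correct_Responses, and Accuracy; it is appended to the log file below.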
    with open(args.log_path, 'a') as f:
        f.write(f"Model: {args.model}, Prompt Type: {args.prompt_type}, Persona Features: {args.persona_features}, Num Sample: {args.num_sample}\n")
        f.write(f"{grouped}\n")
        ## Total Acc
        f.write(f"\nTotal Accuracy: {df['acc'].mean()}\n")
        f.write("\n-------------------------------------\n")
    print("done")