record_trajectories.py
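"""Roll out a question-answering agent in TextWorld games and record its trajectories as offline RL data.

Each qualifying episode (the script effectively uses a batch size of 1) whose
sufficient-information reward meets --sufficient_information_threshold is written
as a single JSON line to <data_out_path>/<environment>.json. Based on the
trajectory construction below, a record looks roughly like:

    {
        "episode_no": 0,
        "steps": [{"step": 0,
                   "state": "...",
                   "command": {"action": "go", "modifier": "[PAD]", "object": "east"},
                   "reward": 1.0},
                  ...],
        "question": "...",
        "answer": "...",
        "mask": [...],
        "entity": "...",
        "attribute": null,
        "total_reward": 1.0
    }
"""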
import argparse
import datetime
import os
import copy
import time
import json
import visdom
import torch
import numpy as np
import tempfile
from os.path import join as pjoin
from distutils.dir_util import copy_tree
import gym
import textworld
from textworld.gym import register_game, make_batch2
from agent import Agent
import generic
import reward_helper
import game_generator
import evaluate
from query import process_facts
import re
# Information requested from the environment in addition to the observation
request_infos = textworld.EnvInfos(description=True,
inventory=True,
verbs=True,
location_names=True,
location_nouns=True,
location_adjs=True,
object_names=True,
object_nouns=True,
object_adjs=True,
facts=True,
last_action=True,
game=True,
admissible_commands=True,
extras=["object_locations", "object_attributes", "uuid"])
def train(variant):
"""
Train the agent while rolling out games and recording its trajectories.
:param variant: dictionary of command-line values used for random rollout collection
"""
game_information_pattern = re.compile(r"(.*) <\|> -= (.*) = -(.*)<\|> (.*)")
data_path = variant["data_path"]
write_append_code = "a" if variant["append"] else "w"
with open(f"{variant['data_out_path']}/{variant['environment']}.json",write_append_code) as offline_rl_data:
time_1 = datetime.datetime.now()
agent = Agent()
step_in_total = 0
### For storing values for printing
running_avg_qa_reward = generic.HistoryScoreCache(capacity=500)
running_avg_sufficient_info_reward = generic.HistoryScoreCache(capacity=500)
running_avg_qa_loss = generic.HistoryScoreCache(capacity=500)
running_avg_correct_state_loss = generic.HistoryScoreCache(capacity=500)
###
output_dir, data_dir = ".", "."
json_file_name = agent.experiment_tag.replace(" ", "_")
best_sum_reward_so_far = 0.0
# load model from checkpoint
if agent.load_pretrained:
if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt")
agent.update_target_net()
elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt")
agent.update_target_net()
else:
print("Failed to load pretrained model... couldn't find the checkpoint file...")
# Create temporary folder for the generated games.
games_dir = tempfile.TemporaryDirectory(prefix="tw_games") # This is not deleted upon error. It would be better to use a with statement.
games_dir = pjoin(games_dir.name, "") # So path ends with '/'.
# copy grammar files into tmp folder so that it works smoothly
assert os.path.exists("./textworld_data"), "Oh no! textworld_data folder is not there..."
os.mkdir(games_dir)
os.mkdir(pjoin(games_dir, "textworld_data"))
copy_tree("textworld_data", games_dir + "textworld_data")
if agent.run_eval:
assert os.path.exists(pjoin(data_path, agent.testset_path)), "Oh no! test_set folder is not there..."
os.mkdir(pjoin(games_dir, agent.testset_path))
copy_tree(pjoin(data_path, agent.testset_path), pjoin(games_dir, agent.testset_path))
# Set queue size for unlimited games mode
if agent.train_data_size == -1:
game_queue_size = agent.batch_size * 5
game_queue = []
episode_no = 0
if agent.train_data_size == -1:
# endless mode
game_generator_queue = game_generator.game_generator_queue(path=games_dir, random_map=agent.random_map, question_type=agent.question_type, max_q_size=agent.batch_size * 2, nb_worker=8)
else:
# generate the training set
all_training_games = game_generator.game_generator(path=games_dir, random_map=agent.random_map, question_type=agent.question_type, train_data_size=agent.train_data_size)
all_training_games.sort()
all_env_ids = None
### GAME LOOP
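# Save the agent's word-to-id vocabulary to word_encodings.json (presumably for re-encoding the recorded states downstream)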
json.dump(agent.word2id, open("word_encodings.json","w"))
while True:
# Break when episode limit is reached
if episode_no > agent.max_episode:
break
# Change the seed for each episode
np.random.seed(episode_no)
if agent.train_data_size == -1:
# endless mode
for _ in range(agent.batch_size):
if not game_generator_queue.empty():
tmp_game = game_generator_queue.get()
if os.path.exists(tmp_game):
game_queue.append(tmp_game)
if len(game_queue) == 0:
time.sleep(0.1)
continue
can_delete_these = []
if len(game_queue) > game_queue_size:
can_delete_these = game_queue[:-game_queue_size]
game_queue = game_queue[-game_queue_size:]
sampled_games = np.random.choice(game_queue, agent.batch_size).tolist()
env_ids = [register_game(gamefile, request_infos=request_infos) for gamefile in sampled_games]
else:
if all_env_ids is None:
all_env_ids = [register_game(gamefile, request_infos=request_infos) for gamefile in all_training_games]
env_ids = np.random.choice(all_env_ids, agent.batch_size).tolist()
if len(env_ids) != agent.batch_size: # either less than or greater than
env_ids = np.random.choice(env_ids, agent.batch_size).tolist()
env_id = make_batch2(env_ids, parallel=True)
env = gym.make(env_id)
env.seed(episode_no)
# Start game - get initial observation and infos
obs, infos = env.reset()
batch_size = len(obs)
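# If not already present, dump the environment's verbs and object adjectives/nouns,
# plus the agent's one- and two-word verb lists, to possible_commands.json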
if not os.path.exists("possible_commands.json"):
with open("possible_commands.json","w") as possible_commands:
d= {**{cmd:list(set(infos[cmd][0])) for cmd in ["verbs","object_adjs","object_nouns"]}, **{"single_verbs" : list(agent.single_word_verbs), "double_verbs" : list(agent.two_word_verbs)}}
possible_commands.write(json.dumps(d))
possible_commands.flush()
# generate question-answer pairs here
questions, answers, reward_helper_info = game_generator.generate_qa_pairs(infos, question_type=agent.question_type, seed=episode_no)
print(f"========== Question: {questions[0]} | Question-type: {agent.question_type} | Answer: {answers[0]} | Episode: {episode_no} ==========")
# Put agent into train mode - for pytorch
agent.train()
# Prepare agent for game - resetting certain variables used in the agent
agent.init(obs, infos)
commands, last_facts, init_facts = [], [], []
commands_per_step, game_facts_cache = [], []
for i in range(batch_size):
commands.append("restart")
last_facts.append(None)
init_facts.append(None)
game_facts_cache.append([])
commands_per_step.append(["restart"])
# process the observation string and get the possible words
observation_strings, possible_words = agent.get_game_info_at_certain_step(obs, infos)
# print(possible_words)
# prepend the last command, joined by the special separator " <|> ", to each observation string used by the agent.
# a list comprehension is needed because each game in the batch has its own observation string and command.
observation_strings = [a + " <|> " + item for a, item in zip(commands, observation_strings)]
# process questions and answers for use in model
input_quest, input_quest_char, _ = agent.get_agent_inputs(questions)
transition_cache = []
print_cmds = []
counting_rewards_np = []
valid_command_rewards_np = []
# This tells the agent to act randomly until the episode number exceeds the start-to-learn threshold.
# Noisy nets, used in the Rainbow algorithm, decide on the randomness of actions themselves,
# so this must be False if noisy nets are used.
act_randomly = True
# push the initial state into the counting-reward dict; this is used for the exploration bonus
state_strings = agent.get_state_strings(infos)
_ = agent.get_binarized_count(state_strings, update=True)
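# `states` collects the first game's observation strings at each step; they become the per-step states in the recorded trajectory below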
states = []
### The actual game step loop where the agent performs actions
for step_no in range(agent.max_nb_steps_per_episode):
# update answerer input
for i in range(batch_size):
if agent.not_finished_yet[i] == 1:
# push observation strings into the agent's history; with the current config the history size is always 1, so this has little effect.
# It essentially allows us to fine-tune how much context and history the agent has access to at each step.
agent.naozi.push_one(i, copy.copy(observation_strings[i]))
if agent.prev_step_is_still_interacting[i] == 1:
new_facts = process_facts(last_facts[i], infos["game"][i], infos["facts"][i], infos["last_action"][i], commands[i])
game_facts_cache[i].append(new_facts) # info used in reward computing of existence question
last_facts[i] = new_facts
if step_no == 0:
init_facts[i] = copy.copy(new_facts)
# generate commands
if agent.noisy_net:
agent.reset_noise() # Draw a new set of noisy weights
# This will always be the current observation string since we set history size to one
observation_strings_w_history = agent.naozi.get()
states.append(observation_strings_w_history[0])
# This gets the observations into their word id forms
input_observation, input_observation_char, _ = agent.get_agent_inputs(observation_strings_w_history)
# states.append(input_observation)
# We feed these observations along with the question to the agent to produce an action
commands, replay_info = agent.act(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=act_randomly)
# We append each game in the batches commands to a 2d array
for i in range(batch_size):
commands_per_step[i].append(commands[i])
replay_info = [observation_strings_w_history, questions, possible_words] + replay_info
# get the real admissible commands from the environment, i.e. the true valid commands for this game step
admissible_commands = [set(item) - set(["look", "wait", "inventory"]) for item in infos["admissible_commands"]]
# get valid command reward:
# for each command generated by the agent, set the valid-command (vc) reward to 1 if it is admissible, otherwise 0
vc_rewards = [float(c in ac) for c, ac in zip(commands, admissible_commands)]
valid_command_rewards_np.append(np.array(vc_rewards))
# pass commands into env
obs, _, _, infos = env.step(commands)
# possible words do not depend on history, because one can only interact with what is currently accessible
observation_strings, possible_words = agent.get_game_info_at_certain_step(obs, infos)
# add action performed to get into state to observation strings
observation_strings = [a + " <|> " + item for a, item in zip(commands, observation_strings)]
# counting rewards - whenever a new state is visited a reward of one is given
state_strings = agent.get_state_strings(infos)
c_rewards = agent.get_binarized_count(state_strings, update=True)
counting_rewards_np.append(np.array(c_rewards))
# Rainbow algorithm - resetting noisy nets
if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
agent.reset_noise() # Draw a new set of noisy weights
# Once the episode number reaches the start-to-learn threshold, start optimizing the weights
if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
# Update agent weights
interaction_loss = agent.update_interaction()
# Log loss
if interaction_loss is not None:
running_avg_correct_state_loss.push(interaction_loss)
# update question answerer
qa_loss = agent.update_qa()
# Log loss
if qa_loss is not None:
running_avg_qa_loss.push(qa_loss)
print_cmds.append(commands[0] if agent.prev_step_is_still_interacting[0] else "--")
# force stopping
if step_no == agent.max_nb_steps_per_episode - 1:
replay_info[-1] = torch.zeros_like(replay_info[-1])
transition_cache.append(replay_info)
step_in_total += 1
if (step_no == agent.max_nb_steps_per_episode - 1 ) or (step_no > 0 and np.sum(generic.to_np(replay_info[-1])) == 0):
break
### End of the action-performing loop - now perform question answering
# The agent has exhausted all steps; now answer the question.
# Get most recent observation
answerer_input = agent.naozi.get()
answerer_input_observation, answerer_input_observation_char, answerer_observation_ids = agent.get_agent_inputs(answerer_input)
chosen_word_indices = agent.answer_question_act_greedy(answerer_input_observation, answerer_input_observation_char, answerer_observation_ids, input_quest, input_quest_char) # batch
chosen_word_indices_np = generic.to_np(chosen_word_indices)
chosen_answers = [agent.word_vocab[item] for item in chosen_word_indices_np]
# rewards
# qa reward
qa_reward_np = reward_helper.get_qa_reward(answers, chosen_answers)
# sufficient info rewards
masks = [item[-1] for item in transition_cache]
masks_np = [generic.to_np(item) for item in masks]
# pad the running mask with a trailing zero, e.g. 1 1 0 0 0 --> 1 1 0 0 0 0
game_finishing_mask = np.stack(masks_np + [np.zeros((batch_size,))], 0) # game step+1 x batch size
# the difference of consecutive steps marks the finishing step, e.g. 1 1 0 0 0 0 --> 0 1 0 0 0
game_finishing_mask = game_finishing_mask[:-1, :] - game_finishing_mask[1:, :] # game step x batch size
game_running_mask = np.stack(masks_np, 0) # game step x batch size
if agent.question_type == "location":
# sufficient info reward: location question
reward_helper_info["observation_before_finish"] = answerer_input
reward_helper_info["game_finishing_mask"] = game_finishing_mask
sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_location(reward_helper_info)
elif agent.question_type == "existence":
# sufficient info reward: existence question
reward_helper_info["observation_before_finish"] = answerer_input
reward_helper_info["game_facts_per_step"] = game_facts_cache # facts before issuing command (we want to stop at correct state)
reward_helper_info["init_game_facts"] = init_facts
reward_helper_info["full_facts"] = infos["facts"]
reward_helper_info["answers"] = answers
reward_helper_info["game_finishing_mask"] = game_finishing_mask
sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_existence(reward_helper_info)
elif agent.question_type == "attribute":
# sufficient info reward: attribute question
reward_helper_info["answers"] = answers
reward_helper_info["game_facts_per_step"] = game_facts_cache # facts before and after issuing commands (we want to compare the differnce)
reward_helper_info["init_game_facts"] = init_facts
reward_helper_info["full_facts"] = infos["facts"]
reward_helper_info["commands_per_step"] = commands_per_step # commands before and after issuing commands (we want to compare the differnce)
reward_helper_info["game_finishing_mask"] = game_finishing_mask
sufficient_info_reward_np = reward_helper.get_sufficient_info_reward_attribute(reward_helper_info)
else:
raise NotImplementedError
trajectory = None
# push qa experience into qa replay buffer
for b in range(batch_size): # data points in batch
# if the agent did not end up in the correct state, do not push it into the replay buffer:
# if the sufficient-information score is LESS than the threshold value, do not
# record the trajectory as part of the training data.
if np.sum(sufficient_info_reward_np[b]) < variant["sufficient_information_threshold"]:
continue
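# Build the trajectory record: episode metadata, the question/answer pair, the finishing mask,
# and (filled in below) the per-step states and commands; it is written as one JSON line further down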
trajectory = {
"episode_no" : episode_no,
"steps" : [],
"question" : questions[0],
"answer" : answers[0],
"mask" : list(game_finishing_mask[:,0]),
"entity" : reward_helper_info["_entities"][0],
"attribute" : None if not reward_helper_info["_attributes"] else reward_helper_info["_attributes"][0],
}
agent.qa_replay_memory.push(False, qa_reward_np[b], answerer_input[b], questions[b], answers[b])
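# Decompose each recorded command into (action, modifier, object) tokens, padding missing slots with "[PAD]"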
for step,state in enumerate(states[:-1]):
# Batch size is 1, so commands_per_step only has a single list inside another list.
split_commands = commands_per_step[0][step+1].split()
act, mod, obj = "", "[PAD]" , "[PAD]"
if len(split_commands) == 3:
act, mod, obj = split_commands
elif len(split_commands) == 2:
act,obj,mod = *split_commands, "[PAD]"
elif len(split_commands) == 1:
act = split_commands[0]
trajectory["steps"].append({"step": step,"state" : state , "command" : {"action" : act, "modifier" : mod, "object" : obj}})
# assign sufficient info reward and counting reward to the corresponding steps
counting_rewards_np = np.stack(counting_rewards_np, 1) # batch x game step
valid_command_rewards_np = np.stack(valid_command_rewards_np, 1) # batch x game step
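# Per-step command reward = sufficient-info reward + (counting reward * revisit_counting_lambda)
# + (valid-command reward * valid_command_bonus_lambda), with both bonuses masked to steps where the game was still running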
command_rewards_np = sufficient_info_reward_np + counting_rewards_np * game_running_mask.T * agent.revisit_counting_lambda + valid_command_rewards_np * game_running_mask.T * agent.valid_command_bonus_lambda # batch x game step
command_rewards = generic.to_pt(command_rewards_np, enable_cuda=agent.use_cuda, type="float") # batch x game step
for i in range(command_rewards_np.shape[1]):
transition_cache[i].append(command_rewards[:, i])
# Record per-step rewards if a trajectory was kept, i.e. if the sufficient-info reward met the threshold
if trajectory:
cum_reward = 0
for step,state in enumerate(states[:-1]):
reward = round(command_rewards_np[0][step],5)
trajectory["steps"][step]["reward"] = reward
cum_reward += reward
trajectory["total_reward"] = cum_reward
print(json.dumps(trajectory),file=offline_rl_data)
# push command generation experience into replay buffer
for b in range(batch_size):
is_prior = np.sum(command_rewards_np[b], 0) > 0.0
for i in range(len(transition_cache)):
batch_observation_strings, batch_question_strings, batch_possible_words, batch_chosen_indices, _, batch_rewards = transition_cache[i]
is_final = True
if masks_np[i][b] != 0:
is_final = False
agent.command_generation_replay_memory.push(is_prior, batch_observation_strings[b], batch_question_strings[b], [item[b] for item in batch_possible_words], [item[b] for item in batch_chosen_indices], batch_rewards[b], is_final)
if masks_np[i][b] == 0.0:
break
# for printing
r_qa = np.mean(qa_reward_np)
r_sufficient_info = np.mean(np.sum(sufficient_info_reward_np, -1))
running_avg_qa_reward.push(r_qa)
running_avg_sufficient_info_reward.push(r_sufficient_info)
print_rewards = np.mean(np.sum(command_rewards_np, -1))
obs_string = answerer_input[0]
# print(obs_string)
# finish game
agent.finish_of_episode(episode_no, batch_size)
# close env
env.close()
if agent.train_data_size == -1:
# when games are generated on the fly,
# remove all files (including .json and .ni) that have been used
files_to_delete = []
for gamefile in can_delete_these:
if not gamefile.endswith(".ulx"):
continue
files_to_delete.append(gamefile)
files_to_delete.append(gamefile.replace(".ulx", ".json"))
files_to_delete.append(gamefile.replace(".ulx", ".ni"))
os.system("rm -f {}".format(" ".join(files_to_delete)))
episode_no += batch_size
time_2 = datetime.datetime.now()
if episode_no < agent.learn_start_from_this_episode:
continue
if episode_no == 0 or (episode_no % agent.save_frequency > (episode_no - batch_size) % agent.save_frequency):
continue
eval_qa_reward, eval_sufficient_info_reward = 0.0, 0.0
# evaluate
if agent.run_eval:
eval_qa_reward, eval_sufficient_info_reward = evaluate.evaluate(data_dir, agent)
# if running eval, save the model by eval accuracy
if eval_qa_reward + eval_sufficient_info_reward > best_sum_reward_so_far:
best_sum_reward_so_far = eval_qa_reward + eval_sufficient_info_reward
agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
# save model
elif agent.save_checkpoint:
if running_avg_qa_reward.get_avg() + running_avg_sufficient_info_reward.get_avg() > best_sum_reward_so_far:
best_sum_reward_so_far = running_avg_qa_reward.get_avg() + running_avg_sufficient_info_reward.get_avg()
agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
# write accuracies down into a file
_s = json.dumps({"time spent": str(time_2 - time_1).rsplit(".")[0],
"sufficient info": running_avg_sufficient_info_reward.get_avg(),
"qa": running_avg_qa_reward.get_avg(),
"eval sufficient info": eval_sufficient_info_reward,
"eval qa": eval_qa_reward})
with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
outfile.write(_s + '\n')
outfile.flush()
# Write the current episode's data to file.
offline_rl_data.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train an agent while recording trajectories for offline RL.")
parser.add_argument("--data_path","-path",
default="./",
help="where the data (games) are.")
parser.add_argument("--environment","-env",
default="random_rollouts",
help="The type of offline RL being recorded.")
parser.add_argument("--data_out_path","-out",
default="./decision_transformer/data",
help="Output path of recorded trajectories.")
parser.add_argument("--append","-a",
type=bool,
default=True)
parser.add_argument("--sufficient_information_threshold","-sui",
type=float,
default=1.0)
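# Example invocation (hypothetical values; the short flags are defined above):
#   python record_trajectories.py -env random_rollouts -out ./decision_transformer/data -sui 1.0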
args = parser.parse_args()
train(variant=vars(args))