# flagrun.py

import time
from typing import Tuple, List

import gym
import numpy as np
# noinspection PyUnresolvedReferences
import hrl_pybullet_envs
# noinspection PyUnresolvedReferences
import pybullet_envs
import torch
from hrl_pybullet_envs import AntFlagrunBulletEnv
from mpi4py import MPI
from torch import Tensor, clamp, cat, nn

import src.core.es as es
from src.core.noisetable import NoiseTable
from src.core.policy import Policy
from src.gym.training_result import TrainingResult, RewardResult
from src.nn.nn import BaseNet, FeedForward
from src.utils import utils
from src.utils.rankers import CenteredRanker
from src.utils.reporters import DefaultMpiReporterSet, StdoutReporter, LoggerReporter


def unit_vec(v: np.ndarray):
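    """Returns `v` scaled to unit length."""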
    return v / np.linalg.norm(v)


def gen_goal(rs, mn=0):
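    """Samples a random 2D goal in [mn, 1] x [mn, 1], rejecting samples that lie within 0.25 of the origin."""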
    # mn: 0 for goal to be to the front or sides of agent, -1 for allowing goal pos to be behind agent
    g = (rs.uniform(mn, 1), rs.uniform(mn, 1))
    while np.linalg.norm(g) < 0.25:
        g = (rs.uniform(mn, 1), rs.uniform(mn, 1))

    return g


class PrimFF(BaseNet):
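    """Goal-conditioned feed-forward policy: the goal is concatenated onto the clipped, normalised observation, and
    Gaussian noise with std `ac_std` is added to the action when a random state is supplied."""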
    def __init__(self, layer_sizes: List[int], activation: nn.Module, obs_shape, ac_std: float, ob_clip=5):
        layers = []
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers += [nn.Linear(in_size, out_size), activation]

        super().__init__(layers, obs_shape, ob_clip)
        self._action_std = ac_std

    GOAL = 'goal'

    def forward(self, inp: Tensor, **kwargs) -> Tensor:
        rs = kwargs['rs']
        goal = kwargs[PrimFF.GOAL]

        inp = clamp((inp - self._obmean) / self._obstd, min=-self.ob_clip, max=self.ob_clip)
        a = self.model(cat((goal, inp)).float())  # goal is concatenated in after clipping and obs normalisation (vbn)
        if self._action_std != 0 and rs is not None:
            a += rs.randn(*a.shape) * self._action_std

        return a


def get_angular_reward(e: gym.Env, pos: np.ndarray, goal_pos: np.ndarray) -> float:
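    """Rewards facing the goal: the cosine of the angle between an approximation of the robot's forward direction
    (derived from its yaw) and the direction from the robot to the goal."""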
    yaw = e.robot_body.pose().rpy()[2]
    x, y, z = e.robot_body.pose().xyz()
    m = np.tan(yaw)
    c = y - m * x

    forward = unit_vec(np.array([1, m + c]) - pos[:2])
    rob_to_goal = unit_vec(goal_pos - pos[:2])
    angle_rew = np.dot(rob_to_goal, forward)

    # debug for angle reward
    e.stadium_scene._p.addUserDebugLine([0, 0, 0], forward.tolist() + [0], lifeTime=0.1, lineColorRGB=[1, 0, 0])
    e.stadium_scene._p.addUserDebugLine([0, 0, 0], rob_to_goal.tolist() + [0], lifeTime=0.1, lineColorRGB=[0, 0, 1])

    return angle_rew


def run_model(model: BaseNet,
              env: gym.Env,
              max_steps: int,
              episodes=1,
              rs: np.random.RandomState = None,
              render: bool = False) -> Tuple[List[float], List[float], np.ndarray, int]:
    """
    Evaluates the model on the provided env.

    :returns: tuple of (single-element list holding the mean episodic reward, flattened xyz position of the robot at
        each timestep padded to `max_steps` positions, observations from each timestep, step reached in the final episode)
    """
    behv = []
    rews = []
    obs = []
    dist_rew = 0

    # goal_pos = goal_normed.numpy() * 7
    # env.walk_target_x, env.walk_target_y = goal_pos
    # env.robot.walk_target_x, env.robot.walk_target_y = goal_pos
    #
    # sq_dist = np.linalg.norm(goal_pos) ** 2

    if render:
        env.render('human')

    with torch.no_grad():
        for ep in range(episodes):
            ob = env.reset()
            # goal_normed = torch.tensor([env.walk_target_x, env.walk_target_y]) / env.size
            # old_dist = -np.linalg.norm(env.unwrapped.parts['torso'].get_position()[:2] - goal_pos)
            for step in range(max_steps):
                ob = torch.from_numpy(ob).float()
                action = model(ob, rs=rs)
                ob, env_rew, done, i = env.step(action.numpy())
                # if 'target' in i:
                #     goal_normed = torch.tensor([*i['target']]) / env.size

                pos = env.unwrapped.robot.body_real_xyz[:2]
                rews += [env_rew]
                obs.append(ob)
                behv.extend(env.unwrapped.robot.body_real_xyz)  # store the full xyz so the 3-element padding below lines up

                if render:
                    env.render('human')
                    # time.sleep(1 / 100)
                    # env.stadium_scene._p.addUserDebugLine([*pos, 0.5], [*(pos + ob[:2]), 0.5], lifeTime=0.1)
                    # robot to goal
                    env.stadium_scene._p.addUserDebugLine([*pos, 0.5], [env.walk_target_x, env.walk_target_y, 0.5],
                                                          lifeTime=0.1)
                    # robot dir
                    # point = [10, m * 10 + c, pos[2]]
                    # env.stadium_scene._p.addUserDebugLine([x, y, pos[2]], point, lifeTime=0.1, lineColorRGB=[0, 1, 0])

                if done:
                    break

            # rews += [-((pos[0] - goal_pos[0]) ** 2 + (pos[1] - goal_pos[1]) ** 2)]

    rew = sum(rews) / episodes
    behv += behv[-3:] * (max_steps - int(len(behv) / 3))  # repeat the final xyz so the behaviour has `max_steps` positions

    return [rew], behv, np.array(obs), step


if __name__ == '__main__':
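    # Distributed ES training on the flagrun env: settings are loaded from the config file given on the command line,
    # candidate policies are evaluated across the MPI processes and the policy is updated every generation via es.step.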
    comm: MPI.Comm = MPI.COMM_WORLD

    cfg_file = utils.parse_args()
    cfg = utils.load_config(cfg_file)

    run_name = f'{cfg.env.name}-{cfg.general.name}'
    reporter = DefaultMpiReporterSet(comm, run_name, StdoutReporter(comm), LoggerReporter(comm, run_name))
    reporter.print(str(cfg))

    env: gym.Env = gym.make(cfg.env.name, **cfg.env.kwargs)
    AntFlagrunBulletEnv.ant_env_rew_weight = cfg.env.ant_env_rew_weight
    AntFlagrunBulletEnv.path_rew_weight = cfg.env.path_rew_weight
    AntFlagrunBulletEnv.dist_rew_weight = cfg.env.dist_rew_weight
    AntFlagrunBulletEnv.goal_reach_rew = cfg.env.goal_reach_rew

    # seeding; this must be done before creating the neural network so that params are deterministic across processes
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving/viewing the seeds used on each proc
    print(f'seeds:{all_seeds}')

    # initializing obstat, policy, optimizer, noise and ranker
    in_size = int(np.prod(env.observation_space.shape))
    out_size = int(np.prod(env.action_space.shape))
    network = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
    policy: Policy = Policy(network, cfg.noise.std, Adam(len(Policy.get_flat(network)), cfg.policy.lr))
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), None, cfg.general.seed)
    ranker = CenteredRanker()

    def r_fn(model: BaseNet, use_ac_noise=True) -> TrainingResult:
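        """Evaluates one sampled policy for `cfg.general.eps_per_policy` episodes and wraps the outcome in a RewardResult."""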
        save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
        rews, behv, obs, steps = run_model(model, env, cfg.env.max_steps, cfg.general.eps_per_policy,
                                           rs if use_ac_noise else None, False)
        return RewardResult(rews, behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)

    assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
    eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)

    for gen in range(cfg.general.gens):  # main loop
        reporter.start_gen()
        # goal = torch.tensor(comm.scatter([gen_goal(rs)] * comm.size if comm.rank == 0 else None))
        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
        policy.update_obstat(gen_obstat)
        reporter.end_gen()

        # final_pos = np.array(tr.behaviour[-3:-1])
        # gp = goal.numpy() * 7
        # dist = np.linalg.norm(final_pos - gp)
        # reporter.log({'dist from goal': dist})

        if gen % 10 == 0 and comm.rank == 0:  # save policy every 10 generations
            policy.save(f'saved/{run_name}/weights/', str(gen))