policy_network.py
import torch
from torch import nn
from torch.nn import functional as F
from dqn import DQN
class PolicyNetwork(DQN):
"""A policy network
Args:
        n_state: dimension of the state space
        n_action: number of possible actions
n_hidden: number of hidden units
lr: learning rate
"""
def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
self.model = nn.Sequential(
nn.Linear(n_state, n_hidden),
nn.ReLU(),
nn.Linear(n_hidden, n_action),
            nn.Softmax(dim=-1)
).to(self.device)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
def predict(self, state):
"""Predict the action probabilities using the model
Args:
state: input state
Returns:
predicted policy
"""
return self.model(torch.Tensor(state).to(self.device))
def update(self, returns, log_probs):
"""Update the weights of the policy network given the training examples
Args:
            returns: per-step returns of an episode; cumulative (discounted) rewards without a baseline, or advantages when a baseline is used
log_probs: log probability for each step
"""
policy_gradient = []
for log_prob, Gt in zip(log_probs, returns):
            # log_prob lives on self.device (possibly CUDA), while Gt is a CPU tensor.
            # A 0-D CPU tensor can still be combined with a CUDA tensor of any shape,
            # but a CPU tensor with one or more dimensions cannot; moving Gt to
            # self.device keeps the multiplication valid in every case.
policy_gradient.append(-log_prob * Gt.to(self.device))
loss = torch.stack(policy_gradient).sum()
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def get_action(self, state):
"""Estimate the policy and sample an action, compute its log probability
Args:
state: input state
Returns:
the selected action (int) and its log probability (tensor)
"""
probs = self.predict(state)
action = torch.multinomial(probs, 1).item()
log_prob = torch.log(probs[action])
return action, log_prob
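

# Illustrative sketch: one way PolicyNetwork could be driven by a REINFORCE-style
# training loop. It assumes the classic Gym step API (obs, reward, done, info) and a
# discrete-action environment such as CartPole; `env`, `gamma` and `n_episode` are
# hypothetical caller-supplied names, not part of this module's API.
def _reinforce_sketch(env, estimator, n_episode, gamma=0.99):
    for _ in range(n_episode):
        log_probs, rewards = [], []
        state = env.reset()
        while True:
            action, log_prob = estimator.get_action(state)
            next_state, reward, is_done, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            if is_done:
                # Discounted return G_t for every step, computed back to front
                returns = []
                Gt = 0.0
                for r in reversed(rewards):
                    Gt = r + gamma * Gt
                    returns.insert(0, Gt)
                estimator.update(torch.tensor(returns), log_probs)
                break
            state = next_state
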
class ValueNetwork(DQN):
"""Fully-connected network to calculate the value of the state
Args:
        n_state: dimension of the state space
n_hidden: number of hidden units
lr: learning rate
"""
def __init__(self, n_state, n_hidden=50, lr=0.05):
self.loss_func = nn.MSELoss()
self.model = torch.nn.Sequential(
nn.Linear(n_state, n_hidden),
nn.ReLU(),
nn.Linear(n_hidden, 1)
).to(self.device)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
def update(self, state, value):
"""Update the model with state and target value
Args:
state: the input state
value: the target value
"""
y_pred = self.model(torch.Tensor(state).to(self.device))
loss = self.loss_func(y_pred, torch.Tensor(value).to(self.device).unsqueeze(1))
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def predict(self, state):
"""Predict the value under certain state
Args:
state: the input state
Returns:
the value of the state
"""
with torch.no_grad():
return self.model(torch.Tensor(state).to(self.device)).cpu()
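

# Illustrative sketch: REINFORCE with a baseline. ValueNetwork fits V(s) to the
# observed returns, and the policy is updated with advantages (returns - baseline)
# instead of raw returns. `env`, `gamma` and `n_episode` are hypothetical
# caller-supplied names, not part of this module's API.
def _reinforce_with_baseline_sketch(env, policy_net, value_net, n_episode, gamma=0.99):
    for _ in range(n_episode):
        states, rewards, log_probs = [], [], []
        state = env.reset()
        while True:
            states.append(state)
            action, log_prob = policy_net.get_action(state)
            next_state, reward, is_done, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            if is_done:
                returns = []
                Gt = 0.0
                for r in reversed(rewards):
                    Gt = r + gamma * Gt
                    returns.insert(0, Gt)
                returns = torch.tensor(returns)
                # Baseline from the current value network, then fit it to the returns
                baseline = value_net.predict(states).squeeze(-1)
                value_net.update(states, returns)
                policy_net.update(returns - baseline, log_probs)
                break
            state = next_state
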
class ActorCriticModel(nn.Module):
"""Actor-critic algorithm model
Args:
        n_input: dimension of the input (state)
        n_output: number of outputs (actions)
n_hidden: number of hidden units, either an int, or List[int]
Returns:
action probabilities under the state, values of the state
"""
def __init__(self, n_input, n_output, n_hidden):
super().__init__()
if isinstance(n_hidden, int):
n_hidden = [n_hidden]
fc_list = [nn.Linear(n_input, n_hidden[0]), nn.ReLU()]
if len(n_hidden) > 1:
for i in range(len(n_hidden)-1):
fc_list.append(nn.Linear(n_hidden[i], n_hidden[i+1]))
fc_list.append(nn.ReLU())
self.backbone = nn.Sequential(*fc_list)
self.action = nn.Linear(n_hidden[-1], n_output)
self.value = nn.Linear(n_hidden[-1], 1)
def forward(self, x):
x = self.backbone(x)
action_probs = F.softmax(self.action(x), dim=-1)
state_values = self.value(x)
return action_probs, state_values
class ActorCriticPolicyNetwork(PolicyNetwork):
"""Policy Network with Actor-Critic algorithm
Args:
        n_state: dimension of the state space
        n_action: number of possible actions
n_hidden: number of hidden units
lr: learning rate
"""
def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
self.model = ActorCriticModel(n_state, n_action, n_hidden).to(self.device)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.9)
def predict(self, state):
"""Predict the action probabilities of action under the input state
Args:
state: the input state
Returns:
the action probabilities, state-value
"""
return self.model(torch.Tensor(state).to(self.device))
def update(self, returns, log_probs, state_values):
"""Update the weights of the Actor Critic network given the training samples
Args:
returns: return (cumulative rewards) for each step in an episode
log_probs: log probability for each step
state_values: state-value for each step
"""
loss = 0
returns = returns.to(self.device).view(-1, 1)
for log_prob, value, Gt in zip(log_probs, state_values, returns):
advantage = Gt - value.item()
policy_loss = -log_prob * advantage
value_loss = F.smooth_l1_loss(value, Gt)
loss += policy_loss + value_loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def get_action(self, state):
"""Estimate the policy and sample an action, compute its log probability
Args:
state: input state
Returns:
the selected action, its log probability, and its state-value
"""
action_probs, state_value = self.predict(state)
action = torch.multinomial(action_probs, 1).item()
log_prob = torch.log(action_probs[action])
return action, log_prob, state_value
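

# Illustrative sketch: one way ActorCriticPolicyNetwork could be trained. Run an
# episode, collect log-probabilities and state-values, then make a single update
# with the discounted returns. `env`, `gamma` and `n_episode` are hypothetical
# caller-supplied names, not part of this module's API.
def _actor_critic_sketch(env, estimator, n_episode, gamma=0.99):
    for _ in range(n_episode):
        log_probs, state_values, rewards = [], [], []
        state = env.reset()
        while True:
            action, log_prob, state_value = estimator.get_action(state)
            next_state, reward, is_done, _ = env.step(action)
            log_probs.append(log_prob)
            state_values.append(state_value)
            rewards.append(reward)
            if is_done:
                returns = []
                Gt = 0.0
                for r in reversed(rewards):
                    Gt = r + gamma * Gt
                    returns.insert(0, Gt)
                estimator.update(torch.tensor(returns), log_probs, state_values)
                break
            state = next_state
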
class ActorCriticGuassianModel(nn.Module):
"""Using a Guassian Model to simulate a Gaussian distribution and calculate the state-value
Args:
n_input: number of input
n_output: number of output
n_hidden: number of hidden units
"""
def __init__(self, n_input, n_output, n_hidden):
super().__init__()
self.fc = nn.Linear(n_input, n_hidden)
self.mu = nn.Linear(n_hidden, n_output)
self.sigma = nn.Linear(n_hidden, n_output)
self.value = nn.Linear(n_hidden, 1)
self.distribution = torch.distributions.normal.Normal
    def forward(self, x):
        x = F.relu(self.fc(x))
        mu = 2 * torch.tanh(self.mu(x))
        sigma = F.softplus(self.sigma(x)) + 1e-5
        # Keep mu and sigma attached to the autograd graph: detaching them here would
        # stop log-probabilities from propagating gradients back to the policy head
        dist = self.distribution(mu.view(1, ), sigma.view(1, ))
        value = self.value(x)
        return dist, value
class ActorCriticGaussianPolicyNetwork(ActorCriticPolicyNetwork):
"""Actor-critic algorithm using a Gaussian estimation network
Args:
        n_state: dimension of the state space
        n_action: dimension of the (continuous) action
n_hidden: number of hidden units
lr: learning rate
"""
def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
self.model = ActorCriticGuassianModel(n_state, n_action, n_hidden).to(self.device)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
def predict(self, state):
"""Compute the distribution and state-value based on the input state
Args:
state: the input state
Returns:
dist (Normal object), value
"""
#self.model.training = False
result = self.model(torch.Tensor(state).to(self.device))
#self.model.training = True
return result
    def get_action(self, state):
        """Sample an action from the predicted Normal distribution and compute its log probability
        Args:
            state: the input state
        Returns:
            the sampled action (numpy array), its log probability, and the estimated state-value
        """
        dist, value = self.predict(state)
        # Sample as a tensor so that log_prob receives a tensor on the same device
        # as the distribution parameters
        action = dist.sample()
        log_prob = dist.log_prob(action[0])
        return action.cpu().numpy(), log_prob, value
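

# Illustrative sketch: the Gaussian variant targets continuous action spaces, so
# get_action returns a NumPy array that can be passed directly to a continuous-control
# environment. The surrounding training loop mirrors the discrete actor-critic sketch
# above; `env` is a hypothetical caller-supplied name.
def _gaussian_step_sketch(env, estimator, state):
    action, log_prob, state_value = estimator.get_action(state)
    next_state, reward, is_done, _ = env.step(action)
    return next_state, reward, is_done, log_prob, state_value
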
class Estimator(PolicyNetwork):
"""Estimator network that can predict the action directly from the state
Args:
        n_state: dimension of the state space
lr: learning rate
"""
def __init__(self, n_state, lr=0.0001):
self.model = nn.Sequential(
nn.Linear(n_state, 1),
nn.Sigmoid()
).to(self.device)
self.loss_func = nn.BCELoss()
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
def predict(self, state):
"""Predict the action using the input state
Args:
state: the input state
Returns:
the action
"""
return self.model(torch.Tensor(state).to(self.device))
def update(self, state, target):
"""Update the model using the state and the target
Args:
state: the input state
target: the target action
"""
y_pred = self.predict(state)
loss = self.loss_func(y_pred, torch.Tensor(target).to(self.device).view(-1, 1))
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
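

# Illustrative sketch: Estimator is a small supervised action predictor. Its sigmoid
# output is treated as the probability of taking action 1, and update() fits it with
# binary cross-entropy against target actions gathered elsewhere (for example, the
# state/action pairs of high-reward episodes). The names below are hypothetical.
def _fit_estimator_sketch(states, target_actions, n_state, n_iter=100):
    estimator = Estimator(n_state)
    for _ in range(n_iter):
        estimator.update(states, target_actions)
    return estimator


def _estimator_greedy_action_sketch(estimator, state):
    # Greedy action: threshold the predicted probability of action 1 at 0.5
    return int(estimator.predict(state).item() >= 0.5)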