
Commit 108f6e4

finish p2
1 parent 05f63ef commit 108f6e4

36 files changed: +3181 −11 lines

udacity_deep_reinforcement_learning/11_ddpg/ddpg-bipedal/DDPG.ipynb

Lines changed: 212 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 189 additions & 0 deletions
@@ -0,0 +1,189 @@
import numpy as np
import random
import copy
from collections import namedtuple, deque

from model import Actor, Critic

import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # for soft update of target parameters
LR_ACTOR = 1e-4          # learning rate of the actor
LR_CRITIC = 3e-4         # learning rate of the critic
WEIGHT_DECAY = 0.0001    # L2 weight decay

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
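
Since the notebook diffs above are not rendered, here is a minimal sketch of how an Agent like the one defined in this file is typically driven in a DDPG training loop. It is orientation only, not the notebook's actual contents: the import path (`ddpg_agent`), the environment id (`BipedalWalker-v2`, matching the folder name and the older gym API), the seed, and the episode/step counts are all assumptions.

    # Hypothetical training loop (not taken from the notebook).
    # Assumes the agent file is importable as `ddpg_agent` and that the classic gym API
    # (env.reset() -> state, env.step() -> (state, reward, done, info)) is in use.
    import gym
    import numpy as np
    from collections import deque
    from ddpg_agent import Agent

    env = gym.make('BipedalWalker-v2')   # assumed environment id for the older gym version
    env.seed(10)
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0],
                  random_seed=10)

    def ddpg(n_episodes=1000, max_t=700):
        scores_deque = deque(maxlen=100)            # rolling window for the average score
        scores = []
        for i_episode in range(1, n_episodes + 1):
            state = env.reset()
            agent.reset()                           # reset the OU noise process each episode
            score = 0
            for t in range(max_t):
                action = agent.act(state)                        # noisy action from the local actor
                next_state, reward, done, _ = env.step(action)   # classic 4-tuple step signature
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_deque.append(score)
            scores.append(score)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        return scores

    scores = ddpg()
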
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, fc_units=256):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc_units (int): Number of nodes in the hidden layer
        """
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc_units)
        self.fc2 = nn.Linear(fc_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = F.relu(self.fc1(state))
        return F.tanh(self.fc2(x))


class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=256, fc3_units=128):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
            fc3_units (int): Number of nodes in the third hidden layer
        """
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc3_units)
        self.fc4 = nn.Linear(fc3_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
        xs = F.leaky_relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        return self.fc4(x)
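
A quick way to see how the two networks fit together: the critic embeds the state first, then concatenates the action before the remaining layers, while the actor squashes its output to [-1, 1] with tanh. The sketch below is not part of the commit; it just pushes a dummy batch through both models to check the shapes, with batch size and dimensions chosen arbitrarily for illustration.

    # Hypothetical shape check (not part of the commit); dimensions are arbitrary.
    import torch
    from model import Actor, Critic

    state_size, action_size, batch = 24, 4, 5
    actor = Actor(state_size, action_size, seed=0)
    critic = Critic(state_size, action_size, seed=0)

    states = torch.randn(batch, state_size)
    actions = actor(states)             # tanh output, so each component lies in [-1, 1]
    q_values = critic(states, actions)  # state is embedded first, then the action is concatenated

    print(actions.shape)                # torch.Size([5, 4])
    print(q_values.shape)               # torch.Size([5, 1])
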

udacity_deep_reinforcement_learning/11_ddpg/ddpg-pendulum/DDPG.ipynb

Lines changed: 196 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
[//]: # (Image References)

[image1]: https://user-images.githubusercontent.com/10624937/42135610-c37e0292-7d12-11e8-8228-4d3585f8c026.gif "Trained Agent"

# Actor-Critic Methods

### Instructions

Open `DDPG.ipynb` to see an implementation of DDPG with OpenAI Gym's Pendulum environment.

### Results

![Trained Agent][image1]
Binary file not shown.
Binary file not shown.

0 commit comments
