import numpy as np
import random
import copy
from collections import namedtuple, deque

from model import Actor, Critic
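
# NOTE: model.py is not included in this file. Judging from how the networks
# are used below, Actor(state_size, action_size, seed) is assumed to map a
# state to an action vector in [-1, 1] (e.g. via a tanh output layer), and
# Critic(state_size, action_size, seed) is assumed to map a (state, action)
# pair to a scalar Q-value estimate.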

import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # for soft update of target parameters
LR_ACTOR = 1e-4          # learning rate of the actor
LR_CRITIC = 3e-4         # learning rate of the critic
WEIGHT_DECAY = 0.0001    # L2 weight decay

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # Zero-mean Gaussian increments keep the exploration noise centred on mu,
        # as the Ornstein-Uhlenbeck process requires.
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.gauss(0.0, 1.0) for _ in range(len(x))])
        self.state = x + dx
        return self.state

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.
        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
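
# ----------------------------------------------------------------------- #
# Usage sketch (illustrative, not part of the original module): a minimal
# training loop, assuming a Gymnasium-style environment with a continuous
# action space and the Actor/Critic definitions from model.py. The
# environment id "Pendulum-v1", the episode counts, and the rescaling by
# env.action_space.high are assumptions for this example, not requirements
# of the Agent class.
# ----------------------------------------------------------------------- #
if __name__ == "__main__":
    import gymnasium as gym

    env = gym.make("Pendulum-v1")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=10)

    n_episodes, max_t = 200, 300
    for i_episode in range(1, n_episodes + 1):
        state, _ = env.reset()          # Gymnasium returns (observation, info)
        agent.reset()                   # reset the OU exploration noise
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)   # noisy action clipped to [-1, 1]
            # Rescale to the environment's native action range before stepping,
            # but store the unscaled action the networks actually produced.
            next_state, reward, terminated, truncated, _ = env.step(action * env.action_space.high)
            done = terminated or truncated
            agent.step(state, action, float(reward), next_state, done)
            state = next_state
            score += float(reward)
            if done:
                break
        print(f"Episode {i_episode}\tScore: {score:.2f}")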