# train.py
import gym
import numpy as np
from tensorboardX import SummaryWriter

from dqn import DQN
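
# Trains a DQN agent on CartPole-v0: every episode is recorded to video,
# per-episode and rolling-average scores are logged to TensorBoard, and the
# model is checkpointed periodically and whenever it beats its best score.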

if __name__ == "__main__":
    EPOCHS = 200
    learning_rate = 0.001  # learning rate for the optimizer
    discount = 0.97  # discount factor (aka gamma) used to compute the target value
    update_target_every = 200  # copy the policy net weights to the target net every x epochs
    # hyperparameters for exploration vs exploitation (epsilon-greedy)
    decay_exp = 0.001  # exponential decay rate of the exploration rate
    exploration_rate = 1  # initial exploration rate (epsilon)
    min_exp_rate = 0.01  # floor for the exploration rate
    MODEL_NAME = "test"
    scores = []
    best_score = 0
    env = gym.make("CartPole-v0")  # initialize the environment
    # record a video of the agent for every episode; force=True overwrites old recordings
    env = gym.wrappers.Monitor(env, "recording", video_callable=lambda episode_id: True, force=True)
    player = DQN(
        input_shape=env.observation_space.shape,
        output_shape=env.action_space.n,
        discount=discount,
        learning_rate=learning_rate,
        sample_size=32,
        update_target_every=update_target_every,
    )  # initialize the DQN agent
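    # The local dqn module is expected to expose play(), update(), create_sample(),
    # train(), save() and load() with the signatures used below.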
    # player.load(MODEL_NAME, load_target_net=True)  # uncomment to resume from a saved checkpoint
    writer = SummaryWriter(f"runs/{MODEL_NAME}")  # TensorBoard writer for the graphs
    # log the hyperparameters alongside the run
    writer.add_hparams(
        {"learning rate": learning_rate, "discount": discount, "target update": update_target_every},
        {},
        MODEL_NAME,
    )
    for epoch in range(EPOCHS):
        score = 0
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy: exploit the policy net or explore with a random action
            if np.random.uniform() > exploration_rate:
                action = player.play(state.reshape(1, 4))  # greedy action from the policy net
                action_taken = "POLICY"  # remember the type of action taken for the log line
            else:
                action = env.action_space.sample()  # random action
                action_taken = "RANDOM"  # remember the type of action taken for the log line
            next_state, reward, done, info = env.step(action)  # take the action in the environment
            score += reward  # add the reward to the total score of the episode
            player.update((state, action, reward, next_state, done))  # store the transition in replay memory
            # if there is enough data in memory, draw a random sample and train on it
            sample = player.create_sample()
            if sample:
                player.train(sample)
            state = next_state  # the next state becomes the current state
        if not epoch % 10:
            player.save(MODEL_NAME)  # save both the policy and the target net every 10 epochs
        # decay the exploration rate exponentially from its initial value of 1 toward min_exp_rate
        exploration_rate = min_exp_rate + (1 - min_exp_rate) * np.exp(-decay_exp * epoch)
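        # e.g. with decay_exp = 0.001: epsilon ~= 0.91 at epoch 100 and ~= 0.82 at epoch 200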
        scores.append(score)  # append the episode score to the scores list
        avg_score = np.mean(scores[max(0, epoch - 100):(epoch + 1)])  # average score over the last 100 episodes
        writer.add_scalar("score", score, epoch)  # log the score to TensorBoard
        writer.add_scalar("average score", avg_score, epoch)  # log the average score to TensorBoard
        if score > best_score:
            best_score = score  # update the best score
            player.save(MODEL_NAME)  # save both nets every time the agent beats its best score
        # print epoch, type of action taken, score, average score, best score
        print("epoch: ", epoch, ", ", action_taken, ", score: %.2f" % score, ", average score: %.2f" % avg_score,
              ", best score: %.2f" % best_score, sep="")
    player.save(MODEL_NAME)  # save both the policy and the target net one last time
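
# To follow training, point TensorBoard at the log directory:
#   tensorboard --logdir runs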