-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathrun_dqn_atari.py
executable file
·133 lines (112 loc) · 4.12 KB
/
run_dqn_atari.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import argparse
import gym
from gym import wrappers
import os.path as osp
import random
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import dqn
from dqn_utils import *
from atari_wrappers import *
def atari_model(img_in, num_actions, scope, reuse=False):
# as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
with tf.variable_scope(scope, reuse=reuse):
out = img_in
with tf.variable_scope("convnet"):
# original architecture
out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
out = layers.flatten(out)
with tf.variable_scope("action_value"):
out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
return out
def atari_learn(env,
session,
num_timesteps):
# This is just a rough estimate
num_iterations = float(num_timesteps) / 4.0
lr_multiplier = 1.0
lr_schedule = PiecewiseSchedule([
(0, 1e-4 * lr_multiplier),
(num_iterations / 10, 1e-4 * lr_multiplier),
(num_iterations / 2, 5e-5 * lr_multiplier),
],
outside_value=5e-5 * lr_multiplier)
optimizer = dqn.OptimizerSpec(
constructor=tf.train.AdamOptimizer,
kwargs=dict(epsilon=1e-4),
lr_schedule=lr_schedule
)
def stopping_criterion(env, t):
# notice that here t is the number of steps of the wrapped env,
# which is different from the number of steps in the underlying env
return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
exploration_schedule = PiecewiseSchedule(
[
(0, 1.0),
(1e6, 0.1),
(num_iterations / 2, 0.01),
], outside_value=0.01
)
dqn.learn(
env,
q_func=atari_model,
optimizer_spec=optimizer,
session=session,
exploration=exploration_schedule,
stopping_criterion=stopping_criterion,
replay_buffer_size=1000000,
batch_size=32,
gamma=0.99,
learning_starts=50000,
learning_freq=4,
frame_history_len=4,
target_update_freq=10000,
grad_norm_clipping=10
)
env.close()
def get_available_gpus():
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU']
def set_global_seeds(i):
try:
import tensorflow as tf
except ImportError:
pass
else:
tf.set_random_seed(i)
np.random.seed(i)
random.seed(i)
def get_session():
tf.reset_default_graph()
tf_config = tf.ConfigProto(
inter_op_parallelism_threads=1,
intra_op_parallelism_threads=1)
session = tf.Session(config=tf_config)
print("AVAILABLE GPUS: ", get_available_gpus())
return session
def get_env(task, seed):
env_id = task.env_id
env = gym.make(env_id)
set_global_seeds(seed)
env.seed(seed)
expt_dir = '/tmp/hw3_vid_dir2/'
env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
env = wrap_deepmind(env)
return env
def main():
# Get Atari games.
benchmark = gym.benchmark_spec('Atari40M')
# Change the index to select a different game.
task = benchmark.tasks[3]
# Run training
seed = 0 # Use a seed of zero (you may want to randomize the seed!)
env = get_env(task, seed)
session = get_session()
atari_learn(env, session, num_timesteps=task.max_timesteps)
if __name__ == "__main__":
main()