This page collects typical usage examples of the theano.tensor.icol function in Python. If you are wondering what exactly icol does, how to call it, or want to see it used in context, the hand-picked code examples below should help.
A total of 20 code examples of the icol function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
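Before turning to the project examples, here is a minimal sketch, not taken from any of the projects below, of what T.icol declares: a symbolic int32 matrix constrained to a single column (broadcastable pattern (False, True)), typically used for per-sample action or label indices. The variable names here are illustrative only.

import numpy as np
import theano
import theano.tensor as T

# T.icol creates a symbolic int32 matrix with exactly one column,
# i.e. TensorType(int32, (False, True)).
actions = T.icol('actions')

# A toy function that squares each entry of the column vector.
square = theano.function([actions], actions ** 2)

a = np.array([[0], [2], [5]], dtype='int32')
print(square(a))  # yields the int32 column [[0], [4], [25]]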
Example 1: __init__
def __init__(self, args):
    reward = T.col('r')
    action = T.icol('a')
    terminal = T.icol('t')
    discount = T.scalar('gamma')
    learningRate = T.scalar('lr')
    rho = T.scalar('rho')
    epsilon = T.scalar('eps')

    rng = np.random.RandomState(42)

    self.batchNb = args.batchSize

    #convLayers = [[(8,8),(4,4),64],
    #              [(4,4),(2,2),128],
    #              [(3,3),(1,1),256],
    #              [(3,3),(1,1),512]]
    #fcl = [1024, 6]
    convLayers = [[(8,8),(4,4),64],
                  [(4,4),(2,2),128],
                  [(3,3),(1,1),256],
                  [(3,3),(1,1),256]]
    fcl = [1024, args.actionNb]

    self.q1 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
    self.q2 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
    self.q2.setParams(self.q1)

    self.states = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
    self.states2 = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
    self.actions = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
    self.rewards = theano.shared(np.zeros((args.batchSize,1), dtype='float32'), broadcastable=(False,True))
    self.terminals = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
    self.learningRate = theano.shared(np.array(args.learningRate, dtype='float32'))
    self.rho = theano.shared(np.array(args.rmsPropRho, dtype='float32'))
    self.epsilon = theano.shared(np.array(args.rmsPropEpsilon, dtype='float32'))
    self.discount = theano.shared(np.array(args.discountFactor, dtype='float32'))

    loss = self.QLoss(self.q1.output, self.q2.output, action, reward, terminal, discount)
    params = self.q1.getParams()
    updates = self.rmsProp(loss, params, rho, epsilon, learningRate)

    self.train_model = theano.function(
        [],
        loss,
        updates=updates,
        givens={
            self.q1.input: self.states,
            self.q2.input: self.states2,
            action: self.actions,
            reward: self.rewards,
            terminal: self.terminals,
            discount: self.discount,
            learningRate: self.learningRate,
            rho: self.rho,
            epsilon: self.epsilon
        }
    )
Author: Levoila, Project: CrappyAI, Lines: 59, Source file: net.py
Example 2: __init__
def __init__(self, lenW, dimW, dimS):
    self.W = th.shared(np.random.randn(lenW, dimW))
    self.Uw = th.shared(np.random.randn(dimW, dimS))
    self.Us = th.shared(np.random.randn(dimS, dimS))
    self.V = th.shared(np.random.randn(dimS, lenW))
    self.S0 = th.shared(np.random.randn(dimS,))
    self.idx = T.icol()
    self.w = self.W[self.idx].reshape((self.idx.shape[0], self.W.shape[1]))

    def recurrence(w, s):
        # import ipdb; ipdb.set_trace()
        s1 = T.nnet.sigmoid(T.dot(w, self.Uw))
        s2 = T.nnet.sigmoid(T.dot(s, self.Us))
        ss = s1 + s2
        pp = T.dot(s, self.V)
        return [ss, pp]

    [self.S, self.PP], _ = th.scan(fn=recurrence, sequences=self.w, outputs_info=[self.S0, None], n_steps=self.w.shape[0])
    self.P = T.nnet.softmax(self.PP)
    self.RP = self.P[T.arange(self.w.shape[0]), self.idx[:,0]]
    self.cost = -T.sum(T.log(self.RP))
    self.params = [self.W, self.Uw, self.Us, self.V, self.S0]
    self.grads = T.grad(self.cost, self.params)
    self.lr = T.scalar()
    self.updates = map(lambda (param, grad): (param, param - self.lr * grad), zip(self.params, self.grads))
    self.train_fn = th.function([self.idx, self.lr], [self.cost], updates=self.updates, allow_input_downcast=True)
    self.fprop = th.function([self.idx], [self.S, self.P, self.cost], allow_input_downcast=True)
Author: sherjilozair, Project: daedalus, Lines: 25, Source file: elman.py
Example 3: __init__
def __init__(self, input_width, input_height, output_dim, num_frames, batch_size):
    self.input_width = input_width
    self.input_height = input_height
    self.output_dim = output_dim
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.gamma = 0.99  # discount factor
    self.rho = 0.99
    self.lr = 0.00025  # learning rate
    self.momentum = 0.95
    self.freeze_targets = True

    self.l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
    if self.freeze_targets:
        self.next_l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
        self.reset_q_hat()

    states = T.tensor4('states')
    next_states = T.tensor4('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    # terminals = T.icol('terminals')

    self.states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
    self.next_states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False,True))
    self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))
    # self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))

    q_vals = self.l_out.get_output(states / 255.0)
    if self.freeze_targets:
        next_q_vals = self.next_l_out.get_output(next_states / 255.0)
    else:
        next_q_vals = self.l_out.get_output(next_states / 255.0)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    target = rewards + self.gamma * T.max(next_q_vals, axis=1, keepdims=True)
    diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1,1))
    loss = T.mean(diff ** 2)

    params = lasagne.layers.helper.get_all_params(self.l_out)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        # terminals: self.terminals_shared
    }
    if self.momentum > 0:
        updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2)
    else:
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6)
    self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
    self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
Author: npow, Project: deep_q_rl, Lines: 54, Source file: network.py
Example 4: _create_network
def _create_network(self):
    logger.info("Building network ...")
    net, input_var = self._build_network()

    target_values = T.matrix('target_output')
    actions = T.icol('actions')

    # Create masks
    # mask = theano.shared(np.zeros((self.batch_size, self.num_actions)).astype(np.int32))
    mask = T.zeros_like(target_values)
    mask = T.set_subtensor(mask[T.arange(self.batch_size), actions.reshape((-1,))], 1)

    # feed-forward path
    network_output = lasagne.layers.get_output(net, input_var / 255.0)

    # Add regularization penalty
    loss = squared_error(network_output * mask, target_values).mean()
    if self.weight_decay > 0.0:
        loss += regularize_network_params(net, l2) * self.weight_decay

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(net, trainable=True)

    # Compute updates for training
    if self.clip_error:
        grads = theano.gradient.grad(loss, all_params)
        grads = [lasagne.updates.norm_constraint(grad, self.clip_error, range(grad.ndim)) for grad in grads]
        updates = self.optimizer(grads, all_params, learning_rate=self.learning_rate, rho=self.decay_rate)
    else:
        updates = self.optimizer(loss, all_params, learning_rate=self.learning_rate, rho=self.decay_rate)

    # Theano functions for training and computing cost
    logger.info("Compiling functions ...")
    train = theano.function([input_var, target_values, actions], [loss, network_output, target_values, mask], updates=updates)
    predict = theano.function([input_var], network_output)
    return net, train, predict
Author: nikolaypavlov, Project: simple_dqn, Lines: 36, Source file: deep_q_network.py
Example 5: __init__
def __init__(self, input_width, input_height, num_actions,
             num_frames, discount, learning_rate, rho,
             rms_epsilon, momentum, clip_delta, freeze_interval,
             batch_size, update_rule,
             batch_accumulator, state_count, input_scale=255.0):
    self.state_count = state_count
    self.input_width = input_width
    self.input_height = input_height
    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.update_counter = 0

    self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                               num_actions, num_frames, batch_size)
    if self.freeze_interval > 0:
        self.next_l_out = self.build_nature_network_dnn(input_width,
                                                        input_height, num_actions,
                                                        num_frames, batch_size)
        self.reset_q_hat()

    states = T.matrix('states')
    next_states = T.matrix('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # buffer for the inputs of the whole batch
    self.states_shared = theano.shared(
        np.zeros((batch_size, state_count),
                 dtype=theano.config.floatX))
    # buffer for the state each sample ends up in
    self.next_states_shared = theano.shared(
        np.zeros((batch_size, state_count),
                 dtype=theano.config.floatX))
    # one reward per episode, so what about individual actions?
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    # one chosen action per episode
    self.actions_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    # ?? probably 0 and 1, whether it is the last value or not
    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    # take q_vals and next q_vals and return the differences for the batch, all of this only for the first pass
    q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
    if self.freeze_interval > 0:
        next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                next_states / input_scale)
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out,
                                                next_states / input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - q_vals[T.arange(batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    # unclear
    if self.clip_delta > 0:
        diff = diff.clip(-self.clip_delta, self.clip_delta)

    if batch_accumulator == 'sum':
        loss = T.sum(diff ** 2)
    elif batch_accumulator == 'mean':
        loss = T.mean(diff ** 2)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    #
    params = lasagne.layers.helper.get_all_params(self.l_out)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
#......... part of the code is omitted here .........
Author: navd, Project: AlgoTrading, Lines: 101, Source file: q_network.py
Example 6: initialize_network
def initialize_network(self):
    """
    :description: this method initializes the network, updates, and theano functions for training and
        retrieving q values. Here's an outline:

        1. build the q network and target q network
        2. initialize theano symbolic variables used for compiling functions
        3. initialize the theano numeric variables used as input to functions
        4. formulate the symbolic loss
        5. formulate the symbolic updates
        6. compile theano functions for training and for getting q_values
    """
    batch_size, input_shape = self.batch_size, self.input_shape
    lasagne.random.set_rng(self.rng)

    # 1. build the q network and target q network
    self.l_out = self.build_network(input_shape, self.num_actions, batch_size)
    self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size)
    self.reset_target_network()

    # 2. initialize theano symbolic variables used for compiling functions
    states = T.tensor4('states')
    actions = T.icol('actions')
    rewards = T.col('rewards')
    next_states = T.tensor4('next_states')
    # terminals are used to indicate a terminal state in the episode and hence a mask over the future
    # q values i.e., Q(s',a')
    terminals = T.icol('terminals')

    # 3. initialize the theano numeric variables used as input to functions
    self.states_shape = (batch_size,) + (1,) + input_shape
    self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
    self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX),
                                        broadcastable=(False, True))
    self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
                                        broadcastable=(False, True))
    self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
                                          broadcastable=(False, True))

    # 4. formulate the symbolic loss
    q_vals = lasagne.layers.get_output(self.l_out, states)
    next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)
    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    # reshape((-1,)) == 'make a row vector', reshape((-1, 1)) == 'make a column vector'
    diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))

    # a lot of the deepmind work clips the td error at 1, so we do that here.
    # the problem is that the gradient backpropagating through this minimum node
    # will be zero if diff is larger than 1.0 (because changing params before
    # the minimum does not impact the output of the minimum). To account for
    # this we take the part of the td error (magnitude) greater than 1.0 and simply
    # add it to the loss, which allows the gradient to backprop, but just linearly
    # in the td error rather than quadratically
    quadratic_part = T.minimum(abs(diff), 1.0)
    linear_part = abs(diff) - quadratic_part
    loss = 0.5 * quadratic_part ** 2 + linear_part
    loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2)

    # 5. formulate the symbolic updates
    params = lasagne.layers.helper.get_all_params(self.l_out)
    updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate)

    # 6. compile theano functions for training and for getting q_values
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
    self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared})
Author: gandalfvn, Project: hierarchical_rl, Lines: 76, Source file: qnetwork.py
Example 7: __init__
#......... part of the code is omitted here .........
self.q_layers.append(
    layers.DenseLayer(self.q_layers[-1],
                      n_outputs=num_actions,
                      weights_std=0.01,
                      init_bias_value=0.1,
                      dropout=0,
                      nonlinearity=layers.identity))
if approximator == 'none':
    self.q_layers.append(\
        layers.DenseLayerNoBias(self.q_layers[-1],
                                n_outputs=num_actions,
                                weights_std=0.00,
                                dropout=0,
                                nonlinearity=layers.identity))
self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

for i in range(len(self.q_layers)-1):
    print self.q_layers[i].get_output_shape()

# Now create a network (using the same weights)
# for next state q values
self.next_layers = copy_layers(self.q_layers)
self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                          self._num_input_features,
                                          self._img_width,
                                          self._img_height,
                                          self.scale_input_by)
self.next_layers[1].input_layer = self.next_layers[0]

self.rewards = T.col()
self.actions = T.icol()

# Build the loss function ...
q_vals = self.q_layers[-1].predictions()
next_q_vals = self.next_layers[-1].predictions()
next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
target = self.rewards + discount * next_maxes
target = theano.gradient.consider_constant(target)
diff = target - q_vals

# Zero out all entries for actions that were not chosen...
mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
diff_masked = diff * mask
error = T.mean(diff_masked ** 2)
self._loss = error * diff_masked.shape[1]  #

self._parameters = layers.all_parameters(self.q_layers[-1])

self._idx = T.lscalar('idx')

# CREATE VARIABLES FOR INPUT AND OUTPUT
self.states_shared = theano.shared(
    np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
self.states_shared_next = theano.shared(
    np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
self.rewards_shared = theano.shared(
    np.zeros((1, 1), dtype=theano.config.floatX),
    broadcastable=(False, True))
self.actions_shared = theano.shared(
    np.zeros((1, 1), dtype='int32'), broadcastable=(False, True))

self._givens = \
    {self.q_layers[0].input_var:
     self.states_shared[self._idx*self._batch_size:
                        (self._idx+1)*self._batch_size, :, :, :],
     self.next_layers[0].input_var:
     self.states_shared_next[self._idx*self._batch_size:
                             (self._idx+1)*self._batch_size, :, :, :],
     self.rewards:
     self.rewards_shared[self._idx*self._batch_size:
                         (self._idx+1)*self._batch_size, :],
     self.actions:
     self.actions_shared[self._idx*self._batch_size:
                         (self._idx+1)*self._batch_size, :]
     }

if self.momentum != 0:
    self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
        self._loss, self._parameters, learning_rate=self.learning_rate,
        rho=self.decay, momentum=self.momentum, epsilon=1e-6)
else:
    self._updates = layers.gen_updates_rmsprop(self._loss,
        self._parameters, learning_rate=self.learning_rate,
        rho=self.decay, epsilon=1e-6)

self._train = theano.function([self._idx], self._loss,
                              givens=self._givens,
                              updates=self._updates)
self._compute_loss = theano.function([self._idx],
                                     self._loss,
                                     givens=self._givens)
self._compute_q_vals = \
    theano.function([self.q_layers[0].input_var],
                    self.q_layers[-1].predictions(),
                    on_unused_input='ignore')
Author: akansal1, Project: einstein, Lines: 101, Source file: cnn_q_learner.py
Example 8: test_git_on_gip
def test_git_on_gip(hyper_params=None, rng_seed=1234):
    assert(not (hyper_params is None))
    # Initialize a source of randomness
    rng = np.random.RandomState(rng_seed)
    sup_count = 100
    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False)
    Xtr_su = datasets[0][0].get_value(borrow=False)
    Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32)
    Xtr_un = datasets[1][0].get_value(borrow=False)
    Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32)
    # get the joint labeled and unlabeled data
    Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX)
    Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]])
    # get the labeled data
    Xtr_su = Xtr_su.astype(theano.config.floatX)
    Ytr_su = Ytr_su[:,np.newaxis]
    # get observations and labels for the validation set
    Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX)
    Yva = datasets[2][1].get_value(borrow=False).astype(np.int32)
    Yva = Yva[:,np.newaxis]  # numpy is dumb
    # get size information for the data
    un_samples = Xtr_un.shape[0]
    su_samples = Xtr_su.shape[0]
    va_samples = Xva.shape[0]
    # set up some symbolic variables for input/output
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    Yd = T.icol('Yd_base')
    # set some "shape" parameters for the networks
    data_dim = Xtr_un.shape[1]
    label_dim = 10
    prior_1_dim = 50
    prior_2_dim = 50
    prior_sigma = 1.0
    batch_size = 100

    ##################
    # SETUP A GIPAIR #
    ##################
    gn1_params = {}
    gn1_config = [prior_1_dim, 600, 600, data_dim]
    gn1_params['mlp_config'] = gn1_config
    gn1_params['activation'] = softplus_actfun
    gn1_params['out_type'] = 'bernoulli'
    gn1_params['lam_l2a'] = 1e-3
    gn1_params['vis_drop'] = 0.0
    gn1_params['hid_drop'] = 0.0
    gn1_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in1_params = {}
    shared_config = [data_dim, 600, 600]
    top_config = [shared_config[-1], prior_1_dim]
    in1_params['shared_config'] = shared_config
    in1_params['mu_config'] = top_config
    in1_params['sigma_config'] = top_config
    in1_params['activation'] = softplus_actfun
    in1_params['lam_l2a'] = 1e-3
    in1_params['vis_drop'] = 0.0
    in1_params['hid_drop'] = 0.0
    in1_params['bias_noise'] = 0.1
    in1_params['input_noise'] = 0.0
    # Initialize the base networks for this GIPair
    IN1 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
                 params=in1_params, shared_param_dicts=None)
    GN1 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
                 params=gn1_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN1.init_biases(0.0)
    GN1.init_biases(0.0)
    # Initialize the GIPair
    GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN1, i_net=IN1, \
                 data_dim=data_dim, prior_dim=prior_1_dim, \
                 params=None, shared_param_dicts=None)
    # Set cost weighting parameters
    GIP.set_lam_nll(1.0)
    GIP.set_lam_kld(1.0)
    GIP.set_lam_l2w(1e-4)

    ##################
    # SETUP A GITRIP #
    ##################
    # set parameters for the generator network
    gn2_params = {}
    gn2_config = [(prior_2_dim + label_dim), 300, prior_1_dim]
    gn2_params['mlp_config'] = gn2_config
    gn2_params['activation'] = softplus_actfun
    gn2_params['out_type'] = 'gaussian'
    gn2_params['lam_l2a'] = 1e-3
    gn2_params['vis_drop'] = 0.0
    gn2_params['hid_drop'] = 0.0
    gn2_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in2_params = {}
#......... part of the code is omitted here .........
Author: darcy0511, Project: NN-Python, Lines: 101, Source file: MnistTests.py
Example 9: __init__
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type,
             update_rule, batch_accumulator, randomState, frame_scale=255.0):
    """ Initialize environment

    Arguments:
        environment - the environment (class Env)
        num_elements_in_batch - list of k integers for the number of each element kept as belief state
        num_actions - int
        discount - float
        learning_rate - float
        rho, rms_epsilon, momentum - float, float, float
        ...
        network_type - string
        ...
    """
    self._environment = environment

    self._batchSize = batchSize
    self._inputDimensions = self._environment.inputDimensions()
    self._nActions = self._environment.nActions()
    self._df = 0
    self.rho = rho
    self._lr = 0
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self._randomState = randomState

    lasagne.random.set_rng(self._randomState)
    self.update_counter = 0

    states = []       # list of symbolic variables for each of the k elements in the belief state
                      # --> [ T.tensor4 if the observation of an element is a matrix, T.tensor3 if a vector, T.matrix if a scalar ]
    next_states = []  # same as states, at t+1
    self.states_shared = []       # list of shared variables for each of the k elements in the belief state
    self.next_states_shared = []  # same as self.states_shared, at t+1

    for i, dim in enumerate(self._inputDimensions):
        if len(dim) == 3:
            states.append(T.tensor4("%s_%s" % ("state", i)))
            next_states.append(T.tensor4("%s_%s" % ("next_state", i)))
        elif len(dim) == 2:
            states.append(T.tensor3("%s_%s" % ("state", i)))
            next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
        elif len(dim) == 1:
            states.append(T.matrix("%s_%s" % ("state", i)))
            next_states.append(T.matrix("%s_%s" % ("next_state", i)))

        self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX), borrow=False))
        self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX), borrow=False))

    print("Number of observations per state: {}".format(len(self.states_shared)))
    print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))

    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')
    thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
    thelr = T.scalar(name='thelr', dtype=theano.config.floatX)

    self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
    print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

    self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
    self._resetQHat()

    self.rewards_shared = theano.shared(
        np.zeros((batchSize, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        np.zeros((batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    q_vals = lasagne.layers.get_output(self.l_out)
    next_q_vals = lasagne.layers.get_output(self.next_l_out)
    max_next_q_vals = T.max(next_q_vals, axis=1, keepdims=True)
    T_ones_like = T.ones_like(T.ones_like(terminals) - terminals)
    target = rewards + T_ones_like * thediscount * max_next_q_vals
    q_val = q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))
    diff = target - q_val

    if self.clip_delta > 0:
#......... part of the code is omitted here .........
Author: Gzzgz, Project: General_Deep_Q_RL, Lines: 101, Source file: q_net_lasagne.py
Example 10: setup
def setup(self):
    lasagne.random.set_rng(self.rng)

    self.update_counter = 0
    self.l_out = self.build_q_network()

    states = T.tensor3('states')
    next_states = T.tensor3('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # Shared variables for training from a minibatch of replayed
    # state transitions, each consisting of an observation,
    # along with the chosen action and resulting
    # reward and terminal status.
    self.states_shared = theano.shared(
        np.zeros((self.batch_size, self.input_height, self.input_width), dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(
        np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((self.batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        np.zeros((self.batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    # Shared variable for a single state, to calculate q_vals.
    self.state_shared = theano.shared(
        np.zeros((self.input_height, self.input_width), dtype=theano.config.floatX))

    # Formulas
    q_vals = lasagne.layers.get_output(self.l_out, states / self.input_scale)
    next_q_vals = lasagne.layers.get_output(self.l_out, next_states / self.input_scale)
    next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    terminalsX = terminals.astype(theano.config.floatX)
    action_mask = T.eq(T.arange(self.num_actions).reshape((1, -1)),
                       actions.reshape((-1, 1))).astype(theano.config.floatX)

    target = (rewards +
              (T.ones_like(terminalsX) - terminalsX) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    output = (q_vals * action_mask).sum(axis=1).reshape((-1, 1))
    diff = target - output

    loss = 0.5 * diff ** 2
    loss = T.sum(loss)
    #loss = T.mean(loss)

    # Params and givens
    params = lasagne.layers.helper.get_all_params(self.l_out)
    updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)

    train_givens = {
        states: self.states_shared[:, :-1],
        next_states: self.imgs_shared[:, 1:],
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    self._train = theano.function([], [loss], updates=updates,
                                  givens=train_givens)

    q_givens = {
        states: self.state_shared.reshape((1,
                                           self.input_height,
                                           self.input_width))
    }
    self._q_vals = theano.function([], q_vals[0], givens=q_givens)
Author: mortennp, Project: misc, Lines: 71, Source file: deep_q_agent_lasagne.py
Example 11: __init__
def __init__(self, input_width, input_height, avail_actions, num_actions,
             num_frames, discount, learning_rate, rho,
             rms_epsilon, momentum, clip_delta, freeze_interval,
             batch_size, network_type, update_rule,
             batch_accumulator, rng, train_all, input_scale=255.0):

    self.input_width = input_width
    self.input_height = input_height
    self.avail_actions = avail_actions
    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.rng = rng
    self.train_all = train_all

    lasagne.random.set_rng(self.rng)

    self.update_counter = 0

    print "num_actions: " + str(num_actions)
    self.l_out = self.build_network(network_type, input_width, input_height,
                                    num_actions, num_frames, batch_size)
    if self.freeze_interval > 0:
        self.next_l_out = self.build_network(network_type, input_width,
                                             input_height, num_actions,
                                             num_frames, batch_size)
        self.reset_q_hat()

    states = T.tensor4('states')
    next_states = T.tensor4('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    self.states_shared = theano.shared(
        np.zeros((batch_size, num_frames, input_height, input_width),
                 dtype=theano.config.floatX))
    self.next_states_shared = theano.shared(
        np.zeros((batch_size, num_frames, input_height, input_width),
                 dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

    if self.freeze_interval > 0:
        next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                next_states / input_scale)
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out,
                                                next_states / input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - q_vals[T.arange(batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
    else:
        loss = 0.5 * diff ** 2

    if batch_accumulator == 'sum':
        loss = T.sum(loss)
    elif batch_accumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))
#......... part of the code is omitted here .........
Author: cowhi, Project: deep_q_rl, Lines: 101, Source file: q_network.py
Example 12: __init__
def __init__(self, rng=None, Xd=None, \
             g_net=None, i_net=None, pn_seq=None, \
             data_dim=None, prior_dim=None, \
             params=None):
    # setup a rng for this AEDPair
    self.rng = RandStream(rng.randint(100000))
    if (params is None):
        self.params = {}
    else:
        self.params = params
    if 'match_type' in params:
        self.match_type = params['match_type']
    else:
        self.match_type = 'grad_sign'
    # we can only try to match sign or direction...
    assert((self.match_type == 'grad_dir') or \
           (self.match_type == 'grad_sign'))
    if self.match_type == 'grad_dir':
        # we match the direction of the gradient under the assumption
        # of gaussian observation noise
        self.mean_transform = lambda x: max_normalize(x, axis=1)
        assert(g_net.out_type == 'gaussian')
    else:
        # we match the sign of the gradient as if it were a collection
        # of independent binary variables
        self.mean_transform = lambda x: 2.0 * (x - 0.5)
        assert(g_net.out_type == 'bernoulli')
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this AEDPair
    self.Xd = Xd
    self.Yd = T.icol('adp_Yd')  # labels to pass to the PeaNetSeq
    self.Xc = 0.0 * self.Xd
    self.Xm = 0.0 * self.Xd
    self.obs_count = T.cast(Xd.shape[0], 'floatX')
    # create a "shared-parameter" clone of the inferencer, set up to
    # receive input from the appropriate symbolic variables.
    self.IN = i_net.shared_param_clone(rng=rng, \
                                       Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
    self.policy_mean = self.IN.output_mean
    self.policy_logvar = self.IN.output_logvar
    # capture a handle for samples from the variational posterior
    self.Xp = self.IN.output
    # create a "shared-parameter" clone of the generator, set up to
    # receive input from samples from the variational posterior
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # set up a var for controlling the max-norm bound on perturbations
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)
    self.lam_mnb = theano.shared(value=zero_ary, \
                                 name='adp_lam_mnb')
    self.set_lam_mnb(lam_mnb=0.1)
    # get the perturbations output by the generator network
    self.Pg = self.mean_transform(self.GN.output)
    if self.match_type == 'grad_dir':
        # samples because we're matching gradient via squared error
        self.Pg_samples = self.mean_transform(self.GN.output_samples)
    else:
        # no samples, because we're matching gradient sign
        self.Pg_samples = self.mean_transform(self.GN.output)
    # record and validate the data dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    # output of the generator and input to the inferencer should both be
    # equal to self.data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # input of the generator and mu/sigma outputs of the inferencer should
    # both be equal to self.prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
    # make a clone of the target PeaNetSeq that takes perturbed inputs
    self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \
                                         seq_Xd=[self.Xd, self.Xd], seq_Yd=[self.Yd, self.Yd], \
                                         no_funcs=True)
    self.grad_pea_Xd = T.grad(self.PNS.joint_cost, self.Xd)
    if self.match_type == 'grad_dir':
        # turn gradient into a unit max-normalized vector
        self.match_target = max_normalize(self.grad_pea_Xd)
    else:
        # transform gradient into binary indicators of sign
        self.match_target = (self.grad_pea_Xd > 0.0)
    # get the symbolic vars for passing inputs to self.PNS
    self.Xd_seq = self.PNS.Xd_seq
    self.Yd_seq = self.PNS.Yd_seq
    self.seq_inputs = self.Xd_seq + self.Yd_seq
    # shared var learning rate for generator and inferencer
    self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
    self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
    # shared var momentum parameters for generator and inferencer
    self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
    self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
    self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
    # init parameters for controlling learning dynamics
#......... part of the code is omitted here .........
Author: Philip-Bachman, Project: NN-Python, Lines: 101, Source file: AEDPair.py
Example 13: __init__
def __init__(self, num_actions):

    # remember parameters
    self.num_actions = num_actions
    self.batch_size = BATCH_SIZE
    self.discount_rate = DISCOUNT_RATE
    self.history_length = HISTORY_LENGTH
    self.screen_dim = DIMS
    self.img_height = SCREEN_HEIGHT
    self.img_width = SCREEN_WIDTH
    self.clip_error = CLIP_ERROR
    self.input_color_scale = COLOR_SCALE

    self.target_steps = TARGET_STEPS
    self.train_iterations = TRAIN_STEPS
    self.train_counter = 0
    self.momentum = MOMENTUM
    self.update_rule = UPDATE_RULE
    self.learning_rate = LEARNING_RATE
    self.rms_decay = RMS_DECAY
    self.rms_epsilon = RMS_EPSILON

    self.rng = np.random.RandomState(RANDOM_SEED)

    # set seed
    lasagne.random.set_rng(self.rng)

    # prepare tensors once and reuse them
    states = T.tensor4('states')
    next_states = T.tensor4('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    # terminals are bool for our case
    terminals = T.bcol('terminals')

    # create shared theano variables
    self.states_shared = theano.shared(
        np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                 dtype=theano.config.floatX))
    self.next_states_shared = theano.shared(
        np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                 dtype=theano.config.floatX))

    # !broadcast ?
    self.rewards_shared = theano.shared(
        np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((self.batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminals_shared = theano.shared(
        #np.zeros((self.batch_size, 1), dtype='int32'),
        np.zeros((self.batch_size, 1), dtype='int8'),
        broadcastable=(False, True))

    # can add multiple nets here
    self.l_primary = self.build_network()

    if self.target_steps > 0:
        self.l_secondary = self.build_network()
        self.copy_to_secondary()

    """
    # input scale i.e. division can be applied to input directly also to normalize
    """

    # define output symbols
    q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale)

    if self.target_steps > 0:
        q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale)
    else:
        # why this ?
        q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale)
        q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary)

    # target = r + max
    target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True))

    """
    # check what this does
    """
    diff = target - q_vals[T.arange(self.batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    # print shape ?
    if self.clip_error > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
#......... part of the code is omitted here .........
Author: hercky, Project: a3c, Lines: 101, Source file: network.py
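A pattern that recurs in nearly every example above is pairing a T.icol symbolic variable with an int32 shared variable declared with broadcastable=(False, True), binding them through givens, and then using the column of action indices to pick one Q-value per row. Here is a minimal, self-contained sketch of that pattern; the variable names and shapes are illustrative and not taken from any specific project above.

import numpy as np
import theano
import theano.tensor as T

batch_size, num_actions = 4, 3

q_vals = T.matrix('q_vals')    # (batch_size, num_actions) Q-values
actions = T.icol('actions')    # (batch_size, 1) int32 action indices

# Shared buffer matching the icol type: int32 with one broadcastable column.
actions_shared = theano.shared(
    np.zeros((batch_size, 1), dtype='int32'),
    broadcastable=(False, True))

# Select Q(s, a) for the chosen action in each row, returned as a column vector.
chosen_q = q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))

f = theano.function([q_vals], chosen_q, givens={actions: actions_shared})

actions_shared.set_value(np.array([[0], [2], [1], [0]], dtype='int32'))
q = np.arange(batch_size * num_actions, dtype=theano.config.floatX).reshape(batch_size, num_actions)
print(f(q))  # picks columns 0, 2, 1, 0 from the four rows of q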