def BuildLoop(self, pred, body, loop_vars):
"""Add the loop termination condition and body to the graph.

Converts `loop_vars` to tensors, wraps them in enter and merge nodes, builds
the graph for `pred` to obtain the loop condition, builds the graph for
`body` on the switched values, wires the body results back into the merge
nodes, and creates the exit nodes.

Args:
  pred: Callable invoked with the merged loop variables; its result is
    converted to a tensor and used as the loop condition.
  body: Callable invoked with the per-iteration loop variables. A result
    that is not a list or tuple is treated as a single value.
  loop_vars: The initial loop variables.

Returns:
  The single exit tensor when there is exactly one loop variable, otherwise
  the list of exit tensors.
"""
# NOTE(review): several statements in this block look truncated (empty
# comprehension elements, an unclosed `_Enter(` call, an `if` without a body,
# and `real_vars` being overwritten unconditionally). Confirm against the
# upstream source before relying on this text.
loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
# Let the context know the loop variables so the _Enter nodes below
# would be added into the context correctly.
self._values = set([ for x in loop_vars])
if self._outer_context is not None:
real_vars = [self._outer_context.AddValue(x) for x in loop_vars]
real_vars = loop_vars
enter_vars = [_Enter(x, self._name, is_constant=False,
for x in real_vars]
self._values = set([ for x in enter_vars])
# Each merge initially receives the enter value on both inputs; input 1 is
# rewired to the next-iteration value once the body has been built.
merge_vars = [merge([x, x])[0] for x in enter_vars]
self._pivot_for_pred = merge_vars[0]
# Build the graph for pred.
c = ops.convert_to_tensor(pred(*merge_vars))
self._pivot = loop_cond(c, name="LoopCond")
switch_vars = [_SwitchRefOrTensor(x, self._pivot) for x in merge_vars]
# Build the graph for body.
# Output 1 of each switch feeds the body; output 0 feeds the exit ops below.
vars_for_body = [_Identity(x[1]) for x in switch_vars]
self._pivot_for_body = vars_for_body[0]
body_result = body(*vars_for_body)
if not isinstance(body_result, (list, _basetuple)):
body_result = [body_result]
result = ops.convert_n_to_tensor_or_indexed_slices(body_result)
next_vars = [next_iteration(x) for x in result]
# Add the back edges to complete the loop.
assert len(merge_vars) == len(next_vars)
for x in zip(merge_vars, next_vars):
x[0].op._update_input(1, x[1])
# Add the exit ops.
exit_vars = [exit(x[0]) for x in switch_vars]
for m_var, n_var, e_var in zip(merge_vars, next_vars, exit_vars):
if m_var.get_shape().is_compatible_with(n_var.get_shape()):
# Exit the loop.
return exit_vars[0] if len(exit_vars) == 1 else exit_vars
def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
"""Fill in default values for grad_ys.
grad_ys: List of gradients, can contain None.
ys: List of tensors.
colocate_gradients_with_ops: If True, try colocating gradients with
the corresponding op.
A list of gradients to use, without None.
ValueError: If one of the grad_ys is invalid.
if len(grad_ys) != len(ys):
raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
for i in xrange(len(grad_ys)):
grad_y = grad_ys[i]
y = ys[i]
if grad_y is None:
with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
grad_ys[i] = array_ops.fill(
array_ops.shape(y), constant_op.constant(
1, dtype=y.dtype))
if grad_y.dtype != y.dtype:
raise ValueError("Y and ys_grad must be of the same type, "
"not y: %s, ys_grad: %s " %
return grad_ys
def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None,
                         capacity=32, name=None):
  """Produces a slice of each `Tensor` in `tensor_list`.

  Implemented using a Queue -- a `QueueRunner` for the Queue
  is added to the current `Graph`'s `QUEUE_RUNNER` collection.

  Args:
    tensor_list: A list of `Tensor` objects. Every `Tensor` in
      `tensor_list` must have the same size in the first dimension.
    num_epochs: An integer (optional). If specified, `slice_input_producer`
      produces each slice `num_epochs` times before generating
      an `OutOfRange` error. If not specified, `slice_input_producer` can cycle
      through the slices an unlimited number of times.
    shuffle: Boolean (optional). Forwarded unchanged to
      `range_input_producer`, which generates the slice indices.
    seed: An integer (optional). Seed used if shuffle == True.
    capacity: An integer. Sets the queue capacity.
    name: A name for the operations (optional).

  Returns:
    A list of tensors, one for each element of `tensor_list`. If the tensor
    in `tensor_list` has shape `[N, a, b, .., z]`, then the corresponding output
    tensor will have shape `[a, b, ..., z]`.

  Raises:
    ValueError: If `tensor_list` is empty.
  """
  with ops.op_scope(tensor_list, name, "input_producer"):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensor_list)
    if not tensor_list:
      raise ValueError(
          "Expected at least one tensor in slice_input_producer().")
    # The first dimension of the first tensor defines how many slices exist.
    range_size = array_ops.shape(tensor_list[0])[0]
    # TODO(josh11b): Add an assertion that the first dimension of
    # everything in TensorList matches. Maybe just check the inferred shapes?
    queue = range_input_producer(range_size, num_epochs=num_epochs,
                                 shuffle=shuffle, seed=seed, capacity=capacity)
    # One dequeued index selects the same row from every input tensor.
    index = queue.dequeue()
    return [array_ops.gather(t, index) for t in tensor_list]
def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
  """Fill in default values for grad_ys.

  A missing (None) gradient is replaced by a tensor of ones with the shape and
  dtype of the corresponding `y`. A supplied gradient is only type-checked
  against its `y`.

  Args:
    grad_ys: List of gradients, can contain None.
    ys: List of tensors.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.

  Returns:
    A list of gradients to use, without None.

  Raises:
    ValueError: If sizes of gradients and inputs don't match
    TypeError: If type of any gradient is not valid for its input.
  """
  if len(grad_ys) != len(ys):
    raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
  grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
  for i, (grad_y, y) in enumerate(zip(grad_ys, ys)):
    if grad_y is None:
      # No sensible default exists for complex outputs; the caller must
      # provide the initial gradient explicitly.
      if y.dtype.is_complex:
        raise TypeError(
            "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
            y.dtype)
      with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
        grad_ys[i] = array_ops.fill(
            array_ops.shape(y), constant_op.constant(
                1, dtype=y.dtype))
      # A freshly built default needs no type validation.
      continue
    if y.dtype.is_floating or y.dtype.is_integer:
      if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
        raise TypeError("Gradient type %s generated for real or "
                        "integer-valued tensor %s with type %s must be "
                        "real or integer" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
    elif y.dtype.is_complex:
      if not grad_y.dtype.is_complex:
        # The check requires a complex gradient, so the message says so.
        raise TypeError("Gradient type %s generated for complex-valued "
                        "tensor %s with type %s must be complex" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
    else:
      raise TypeError("Tensor %s with type %s must be numeric "
                      "to obtain a default gradient" %
                      (y, dtypes.as_dtype(y.dtype).name))
  return grad_ys
def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1,
enqueue_many=False, prebatch_capacity=16,
prebatch_threads=1, runtime_checks=False, name=None):
"""Stochastically creates batches by rejection sampling.
Each list of non-batched tensors is evaluated by `accept_prob_fn`, to produce
a scalar tensor between 0 and 1. This tensor corresponds to the probability of
being accepted. When `batch_size` tensor groups have been accepted, the batch
queue will return a mini-batch.
tensors: List of tensors for data. All tensors are either one item or a
batch, according to enqueue_many.
accept_prob_fn: A python lambda that takes a non-batch tensor from each
item in `tensors`, and produces a scalar tensor.
batch_size: Size of batch to be returned.
queue_threads: The number of threads for the queue that will hold the final
enqueue_many: Bool. If true, interpret input tensors as having a batch dimension.
prebatch_capacity: Capacity for the large queue that is used to convert
batched tensors to single examples.
prebatch_threads: Number of threads for the large queue that is used to
convert batched tensors to single examples.
runtime_checks: Bool. If true, insert runtime checks on the output of
`accept_prob_fn`. Using `True` might have a performance impact.
name: Optional prefix for ops created by this function.
ValueError: enqueue_many is True and labels doesn't have a batch
dimension, or if enqueue_many is False and labels isn't a scalar.
ValueError: enqueue_many is True, and batch dimension on data and labels
don't match.
ValueError: if a zero initial probability class has a nonzero target probability.
A list of tensors of the same length as `tensors`, with batch dimension
# Get tensor for a single data and label example.
data, label = data_provider.Get(['data', 'label'])
# Get stratified batch according to data tensor.
accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2
data_batch =
[data, label], accept_prob_fn, 16)
# Run batch through network.
with variable_scope.variable_scope(name, 'rejection_sample', tensors):
tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
# Reduce the case of a batched example to that of a batch of a single
# example by taking a batch of size one.
if enqueue_many:
# Validate that batch dimension of the input is consistent.
tensor_list = _verify_data_inputs(tensor_list)
# Make a single queue to hold input examples. Reshape output so examples
# don't have singleton batch dimension.
batched = input_ops.batch(tensor_list,
tensor_list = [array_ops.squeeze(x, [0]) for x in batched]
# Set up a queue containing batches that have the distribution.
cur_prob = accept_prob_fn(tensor_list)
if runtime_checks:
cur_prob = array_ops.identity(control_flow_ops.with_dependencies(
[check_ops.assert_less_equal(0.0, cur_prob),
check_ops.assert_less_equal(cur_prob, 1.0)],
cur_prob), name='prob_with_checks')
keep_input = random_ops.random_uniform([]) < cur_prob
return _conditional_batch(
tensor_list, keep_input, batch_size, num_threads=queue_threads)
def stratified_sample_unknown_dist(tensors, labels, probs, batch_size,
enqueue_many=False, queue_capacity=16,
threads_per_queue=1, name=None):
"""Stochastically creates batches based on per-class probabilities.
**NOTICE** This sampler can be significantly slower than `stratified_sample`
due to each thread discarding all examples not in its assigned class.
This uses a number of threads proportional to the number of classes. See
`stratified_sample` for an implementation that discards fewer examples and
uses a fixed number of threads. This function's only advantage over
`stratified_sample` is that the class data-distribution doesn't need to be
known ahead of time.
tensors: List of tensors for data. All tensors are either one item or a
batch, according to enqueue_many.
labels: Tensor for label of data. Label is a single integer or a batch,
depending on enqueue_many. It is not a one-hot vector.
probs: Target class probabilities. An object whose type has a registered
Tensor conversion function.
batch_size: Size of batch to be returned.
enqueue_many: Bool. If true, interpret input tensors as having a batch dimension.
queue_capacity: Capacity of each per-class queue.
threads_per_queue: Number of threads for each per-class queue.
name: Optional prefix for ops created by this function.
ValueError: enqueue_many is True and labels doesn't have a batch
dimension, or if enqueue_many is False and labels isn't a scalar.
ValueError: enqueue_many is True, and batch dimension of data and labels
don't match.
ValueError: if probs don't sum to one.
TFAssertion: if labels aren't integers in [0, num classes).
(data_batch, label_batch), where data_batch is a list of tensors of the same
length as `tensors`
# Get tensor for a single data and label example.
data, label = data_provider.Get(['data', 'label'])
# Get stratified batch according to per-class probabilities.
init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
[data_batch], labels = (
[data], label, init_probs, 16))
# Run batch through network.
with ops.name_scope(name, 'stratified_sample_unknown_dist',
tensors + [labels]):
tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
labels = ops.convert_to_tensor(labels)
probs = ops.convert_to_tensor(probs, dtype=dtypes.float32)
# Reduce the case of a single example to that of a batch of size 1.
if not enqueue_many:
tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list]
labels = array_ops.expand_dims(labels, 0)
# Validate that input is consistent.
tensor_list, labels, [probs] = _verify_input(tensor_list, labels, [probs])
# Make per-class queues.
per_class_queues = _make_per_class_queues(
tensor_list, labels, probs.get_shape().num_elements(), queue_capacity,
# Use the per-class queues to generate stratified batches.
return _get_batch_from_per_class_queues(
per_class_queues, probs, batch_size)
def stratified_sample(tensors, labels, target_probs, batch_size,
init_probs=None, enqueue_many=False, queue_capacity=16,
threads_per_queue=1, name=None):
"""Stochastically creates batches based on per-class probabilities.
This method discards examples. Internally, it creates one queue to amortize
the cost of disk reads, and one queue to hold the properly-proportioned
batch. See `stratified_sample_unknown_dist` for a function that performs
stratified sampling with one queue per class and doesn't require knowing the
class data-distribution ahead of time.
tensors: List of tensors for data. All tensors are either one item or a
batch, according to enqueue_many.
labels: Tensor for label of data. Label is a single integer or a batch,
depending on enqueue_many. It is not a one-hot vector.
target_probs: Target class proportions in batch. An object whose type has a
registered Tensor conversion function.
batch_size: Size of batch to be returned.
init_probs: Class proportions in the data. An object whose type has a
registered Tensor conversion function, or `None` for estimating the
initial distribution.
enqueue_many: Bool. If true, interpret input tensors as having a batch dimension.
queue_capacity: Capacity of the large queue that holds input examples.
threads_per_queue: Number of threads for the large queue that holds input
examples and for the final queue with the proper class proportions.
name: Optional prefix for ops created by this function.
ValueError: enqueue_many is True and labels doesn't have a batch
dimension, or if enqueue_many is False and labels isn't a scalar.
ValueError: enqueue_many is True, and batch dimension on data and labels
don't match.
ValueError: if probs don't sum to one.
ValueError: if a zero initial probability class has a nonzero target probability.
TFAssertion: if labels aren't integers in [0, num classes).
(data_batch, label_batch), where data_batch is a list of tensors of the same
length as `tensors`
# Get tensor for a single data and label example.
data, label = data_provider.Get(['data', 'label'])
# Get stratified batch according to per-class probabilities.
target_probs = [...distribution you want...]
[data_batch], labels =
[data], label, target_probs)
# Run batch through network.
with ops.name_scope(name, 'stratified_sample', tensors + [labels]):
tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
labels = ops.convert_to_tensor(labels)
target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
# Reduce the case of a single example to that of a batch of size 1.
if not enqueue_many:
tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list]
labels = array_ops.expand_dims(labels, 0)
# If `init_probs` is `None`, set up online estimation of data distribution.
if init_probs is None:
# We use `target_probs` to get the number of classes, so its shape must be
# fully defined at graph construction time.
init_probs = _estimate_data_distribution(
labels, target_probs.get_shape().num_elements())
init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
# Validate that input is consistent.
tensor_list, labels, [init_probs, target_probs] = _verify_input(
tensor_list, labels, [init_probs, target_probs])
# Check that all zero initial probabilities also have zero target
# probabilities.
assert_op = control_flow_ops.Assert(
math_ops.not_equal(init_probs, 0),
math_ops.equal(target_probs, 0))),
['All classes with zero initial probability must also have zero target '
'probability: ', init_probs, target_probs])
init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)
# Calculate acceptance sampling probabilities.
accept_probs = _calculate_acceptance_probabilities(init_probs, target_probs)
proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
accept_probs = control_flow_ops.cond(
math_ops.less(proportion_rejected, .5),
lambda: accept_probs,
lambda: logging_ops.Print( # pylint: disable=g-long-lambda
accept_probs, [accept_probs],
message='Proportion of examples rejected by sampler is high.',
# Make a single queue to hold input examples. Reshape output so examples
# don't have singleton batch dimension.
batched = input_ops.batch(tensor_list + [labels],
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
gate_gradients, aggregation_method, stop_gradients):
"""Implementation of gradients()."""
if context.executing_eagerly():
raise RuntimeError("tf.gradients not supported when eager execution "
"is enabled. Use tf.contrib.eager.GradientTape "
ys = _AsList(ys)
xs = _AsList(xs)
stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
if grad_ys is None:
grad_ys = [None] * len(ys)
grad_ys = _AsList(grad_ys)
with ops.name_scope(
name, "gradients",
list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
xs = [
x.handle if resource_variable_ops.is_resource_variable(x) else x
for x in xs
xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
xs, name="x", as_ref=True)
grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
# The approach we take here is as follows: Create a list of all ops in the
# subgraph between the ys and xs. Visit these ops in reverse order of ids
# to ensure that when we visit an op the gradients w.r.t its outputs have
# been collected. Then aggregate these gradients if needed, call the op's
# gradient function, and add the generated gradients to the gradients for
# its input.
# Initialize the pending count for ops in the connected subgraph from ys
# to the xs.
if len(ys) > 1:
ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
to_ops = [t.op for t in ys]
from_ops = [t.op for t in xs]
stop_gradient_ops = [t.op for t in stop_gradients]
pending_count, loop_state = _PendingCount(
ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
# Iterate over the collected ops.
# grads: op => list of gradients received on each output endpoint of the
# op. The gradients for each endpoint are initially collected as a list.
# When it is time to call the op's gradient function, for each endpoint we
# aggregate the list of received gradients into a Add() Operation if there
# is more than one.
grads = {}
# Add the initial gradients for the ys.
for y, grad_y in zip(ys, grad_ys):
_SetGrad(grads, y, grad_y)
# Initialize queue with to_ops.
queue = collections.deque()
# Add the ops in 'to_ops' into the queue.
to_ops_set = set()
for op in to_ops:
# 'ready' handles the case where one output gradient relies on
# another output's gradient.
# pylint: disable=protected-access
ready = (pending_count[op._id] == 0)
if ready and op._id not in to_ops_set:
# pylint: enable=protected-access
if loop_state:
loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
for y in loop_exits:
if _IsTrainable(y):
_SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
while queue:
# generate gradient subgraph for op.
op = queue.popleft()
with _maybe_colocate_with(op, colocate_gradients_with_ops):
if loop_state:
loop_state.EnterGradWhileContext(op, before=True)
out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
if loop_state:
loop_state.ExitGradWhileContext(op, before=True)
grad_fn = None
# pylint: disable=protected-access
func_call = None
is_func_call = ops.get_default_graph()._is_function(op.type)
has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
if has_out_grads and (op._id not in stop_ops):
if is_func_call:
func_call = ops.get_default_graph()._get_function(op.type)
grad_fn = func_call.python_grad_func
# pylint: enable=protected-access
def _validate(tensor_list):
  """Convert `tensor_list` to tensors and reject an empty result.

  Args:
    tensor_list: The values to convert with
      `ops.convert_n_to_tensor_or_indexed_slices`.

  Returns:
    The converted list.

  Raises:
    ValueError: If the converted list is empty.
  """
  converted = ops.convert_n_to_tensor_or_indexed_slices(tensor_list)
  if converted:
    return converted
  raise ValueError("Expected at least one tensor in batch().")
def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
"""Returns the element-wise sum of a list of tensors.
Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
otherwise, these are inferred.
For example:
# tensor 'a' is [[1, 2], [3, 4]]
# tensor `b` is [[5, 0], [0, 6]]
tf.accumulate_n([a, b, a]) ==> [[7, 4], [6, 14]]
# Explicitly pass shape and type
tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
==> [[7, 4], [6, 14]]
inputs: A list of `Tensor` objects, each with same shape and type.
shape: Shape of elements of `inputs`.
tensor_dtype: The type of `inputs`.
name: A name for the operation (optional).
A `Tensor` of same shape and type as the elements of `inputs`.
ValueError: If `inputs` don't all have same shape and dtype or the shape
cannot be inferred.
if tensor_dtype is None:
if not inputs or not isinstance(inputs, (list, tuple)):
raise ValueError("inputs must be a list of at least one Tensor with the "
"same dtype and shape")
inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
if not all(isinstance(x, ops.Tensor) for x in inputs):
raise ValueError("inputs must be a list of at least one Tensor with the "
"same dtype and shape")
if not all(x.dtype == inputs[0].dtype for x in inputs):
raise ValueError("inputs must be a list of at least one Tensor with the "
"same dtype and shape")
tensor_dtype = inputs[0].dtype
if shape is not None:
shape = tensor_shape.as_shape(shape)
shape = tensor_shape.unknown_shape()
for input_tensor in inputs:
if isinstance(input_tensor, ops.Tensor):
shape = shape.merge_with(input_tensor.get_shape())
if not shape.is_fully_defined():
# TODO(pbar): Make a version of assign_add that accepts an uninitialized
# lvalue, and takes its shape from that? This would allow accumulate_n to
# work in all situations that add_n currently works.
raise ValueError("Cannot infer the shape of the accumulator for "
"accumulate_n. Pass the shape argument, or set the shape "
"of at least one of the inputs.")
with ops.op_scope(inputs, name, "AccumulateN") as name:
var = gen_state_ops._temporary_variable(shape=shape, dtype=tensor_dtype)
var_name =
var = state_ops.assign(var, array_ops.zeros_like(inputs[0]))
update_ops = []
for input_tensor in inputs:
op = state_ops.assign_add(var, input_tensor, use_locking=True)
with ops.control_dependencies(update_ops):
return gen_state_ops._destroy_temporary_variable(var,
def gradients(ys,
"""Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.
`ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys`
is a list of `Tensor`, holding the gradients received by the
`ys`. The list must be the same length as `ys`.
`gradients()` adds ops to the graph to output the derivatives of `ys` with
respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where
each tensor is the `sum(dy/dx)` for y in `ys`.
`grad_ys` is a list of tensors of the same length as `ys` that holds
the initial gradients for each y in `ys`. When `grad_ys` is None,
we fill in a tensor of '1's of the shape of y for each y in `ys`. A
user can provide their own initial `grad_ys` to compute the
derivatives using a different initial gradient for each y (e.g., if
one wanted to weight the gradient differently for each value in
each y).
`stop_gradients` is a `Tensor` or a list of tensors to be considered constant
with respect to all `xs`. These tensors will not be backpropagated through,
as though they had been explicitly disconnected using `stop_gradient`. Among
other things, this allows computation of partial derivatives as opposed to
total derivatives. For example:
a = tf.constant(0.)
b = 2 * a
g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is
equivalent to:
a = tf.stop_gradient(tf.constant(0.))
b = tf.stop_gradient(2 * a)
g = tf.gradients(a + b, [a, b])
`stop_gradients` provides a way of stopping gradient after the graph has
already been constructed, as compared to `tf.stop_gradient` which is used
during graph construction. When the two approaches are combined,
backpropagation stops at both `tf.stop_gradient` nodes and nodes in
`stop_gradients`, whichever is encountered first.
ys: A `Tensor` or list of tensors to be differentiated.
xs: A `Tensor` or list of tensors to be used for differentiation.
grad_ys: Optional. A `Tensor` or list of tensors the same size as
`ys` and holding the gradients computed for each y in `ys`.
name: Optional name to use for grouping all the gradient ops together.
defaults to 'gradients'.
colocate_gradients_with_ops: If True, try colocating gradients with
the corresponding op.
gate_gradients: If True, add a tuple around the gradients returned
for an operation. This avoids some race conditions.
aggregation_method: Specifies the method used to combine gradient terms.
Accepted values are constants defined in the class `AggregationMethod`.
stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through.
A list of `sum(dy/dx)` for each x in `xs`.
LookupError: if one of the operations between `x` and `y` does not
have a registered gradient function.
ValueError: if the arguments are invalid.
RuntimeError: if called in Eager mode.
if context.in_eager_mode():
raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
"functions in tf.contrib.eager.backprop instead.")
ys = _AsList(ys)
xs = _AsList(xs)
stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
if grad_ys is None:
grad_ys = [None] * len(ys)
grad_ys = _AsList(grad_ys)
with ops.name_scope(
name, "gradients",
list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
xs = [
x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x
for x in xs
def embedding_lookup(params, ids, name=None):
"""Looks up `ids` in a list of embedding tensors.
This function is used to perform parallel lookups on the list of
tensors in `params`. It is a generalization of
`tf.gather()`, where `params` is
interpreted as a partition of a larger embedding tensor.
If `len(params) > 1`, each element `id` of `ids` is partitioned between
the elements of `params` by computing `p = id % len(params)`, and is
then used to look up the slice `params[p][id // len(params), ...]`.
The results of the lookup are then concatenated into a dense
tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
params: A list of tensors with the same shape and type.
ids: A `Tensor` with type `int32` containing the ids to be looked
up in `params`.
name: A name for the operation (optional).
A `Tensor` with the same type as the tensors in `params`.
ValueError: If `params` is empty.
if not isinstance(params, list):
params = [params]
with ops.op_scope(params + [ids], name, "embedding_lookup") as name:
if not params:
raise ValueError("Need at least one param")
np = len(params) # Number of partitions
params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
if np == 1:
with ops.device(params[0].device):
return array_ops.gather(params[0], ids, name=name)
ids = ops.convert_to_tensor(ids, name="ids")
flat_ids = array_ops.reshape(ids, [-1])
original_indices = math_ops.range(0, array_ops.size(flat_ids))
# Compute flat_ids % partitions for each id
ids_mod_p = flat_ids % np
if ids_mod_p.dtype != types.int32:
ids_mod_p = math_ops.cast(ids_mod_p, types.int32)
# Partition single list of ids based on ids % np into np separate lists
plist = data_flow_ops.dynamic_partition(flat_ids, ids_mod_p, np)
# Similarly, partition the original indices.
pindices = data_flow_ops.dynamic_partition(original_indices, ids_mod_p,
# Do np separate lookups, finding embeddings for plist[p] in params[p]
partitioned_result = []
for p in xrange(np):
# TODO(agarwal): handle device allocations here and later in the
# colocate code.
gather_ids = plist[p] // np
with ops.device(params[p].device):
partitioned_result.append(array_ops.gather(params[p], gather_ids))
# Stitch these back together
ret = data_flow_ops.dynamic_stitch(pindices, partitioned_result,
# Reshape to reverse the flattening of ids.
# It's important that we compute params[0].shape on the right device
# to avoid data motion.
with ops.device(params[0].device):
params_shape = array_ops.shape(params[0])
ret = array_ops.reshape(ret, array_ops.concat(0, [
array_ops.shape(ids), array_ops.slice(params_shape, [1], [-1])]))
# output shape = ids.shape + params[*].shape[1:]
# Normally the reshape is sufficient, but setting shape explicitly
# teaches shape inference that params[1:].get_shape() matters.
element_shape = params[0].get_shape()[1:]
for p in params[1:]:
element_shape = element_shape.merge_with(p.get_shape()[1:])
return ret
def _embedding_lookup_and_transform(params,
"""Helper function for embedding_lookup and _compute_sampled_logits.
This function is a generalization of embedding_lookup that optionally
applies a caller-specified transformation to each embedding. This is
done through the `transform_fn` argument. If provided, the function is
applied to each partitioned tensor of retrieved embeddings, colocated
with the embeddings. This function will be called with a single `Tensor`
argument of the same type as the `params` tensor and should return a
`Tensor`. The shape of the argument will be the same as `params` except
for the size of the first dimension. The first dimension of the result's
shape must be the same size as the argument's.
params: See embedding_lookup.
ids: See embedding_lookup.
partition_strategy: See embedding_lookup.
name: See embedding_lookup.
max_norm: See embedding_lookup.
transform_fn: An optional function to apply to each retrieved embedding.
If max_norm is provided, transform_fn is applied to the norm-limited
See embedding_lookup for details.
ValueError: If `params` is empty.
if params is None or params in ((), []):
raise ValueError("Need at least one param")
if isinstance(params, variables.PartitionedVariable):
params = list(params) # Iterate to get the underlying Variables.
if not isinstance(params, list):
params = [params]
with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
np = len(params) # Number of partitions
# Preserve the resource variable status to avoid accidental dense reads.
if not any(
isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
ids = ops.convert_to_tensor(ids, name="ids")
if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
with ops.colocate_with(params[0]):
result = _clip(array_ops.gather(params[0], ids, name=name),
ids, max_norm)
if transform_fn:
result = transform_fn(result)
# Make sure the final result does not have colocation constraints on the
# params. Similar to the case np > 1 where parallel_dynamic_stitch is
# outside the scope of all with ops.colocate_with(params[p]).
return array_ops.identity(result)
# Flatten the ids. There are two cases where we need to do this.
# - There is more than one params tensor.
# - There is a transform_fn and ids is not statically known to be 1-D.
# We must flatten in this case because transform_fn expects a flat
# tensor of embeddings.
flat_ids = array_ops.reshape(ids, [-1])
original_indices = math_ops.range(array_ops.size(flat_ids))
# Create p_assignments and set new_ids depending on the strategy.
if partition_strategy == "mod":
p_assignments = flat_ids % np
new_ids = flat_ids // np
elif partition_strategy == "div":
# Compute num_total_ids as the sum of dim-0 of params, then assign to
# partitions based on a constant number of ids per partition. Optimize
# if we already know the full shape statically.
dim_0_size = tensor_shape.Dimension(tensor_shape.dimension_value(
for p in xrange(1, np):
dim_0_size += tensor_shape.Dimension(tensor_shape.dimension_value(
if dim_0_size.value:
num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
dim_0_sizes = []
for p in xrange(np):
param_p_dim = tensor_shape.dimension_value(params[p].get_shape()[0])
if param_p_dim is not None:
with ops.colocate_with(params[p]):
num_total_ids = math_ops.reduce_sum(
math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
ids_per_partition = num_total_ids // np
extras = num_total_ids % np
p_assignments = math_ops.maximum(
flat_ids // (ids_per_partition + 1),
(flat_ids - extras) // ids_per_partition)
# Emulate a conditional using a boolean indicator tensor
def gradients(ys, xs, grad_ys=None, name="gradients",
"""Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.
`ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys`
is a list of `Tensor`, holding the gradients received by the
`ys`. The list must be the same length as `ys`.
`gradients()` adds ops to the graph to output the partial
derivatives of `ys` with respect to `xs`. It returns a list of
`Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
for y in `ys`.
`grad_ys` is a list of tensors of the same length as `ys` that holds
the initial gradients for each y in `ys`. When `grad_ys` is None,
we fill in a tensor of '1's of the shape of y for each y in `ys`. A
user can provide their own initial `grad_ys` to compute the
derivatives using a different initial gradient for each y (e.g., if
one wanted to weight the gradient differently for each value in
each y).
ys: A `Tensor` or list of tensors to be differentiated.
xs: A `Tensor` or list of tensors to be used for differentiation.
grad_ys: Optional. A `Tensor` or list of tensors the same size as
`ys` and holding the gradients computed for each y in `ys`.
name: Optional name to use for grouping all the gradient ops together.
defaults to 'gradients'.
colocate_gradients_with_ops: If True, try colocating gradients with
the corresponding op.
gate_gradients: If True, add a tuple around the gradients returned
for an operation. This avoids some race conditions.
aggregation_method: Specifies the method used to combine gradient terms.
Accepted values are constants defined in the class `AggregationMethod`.
A list of `sum(dy/dx)` for each x in `xs`.
LookupError: if one of the operations between `x` and `y` does not
have a registered gradient function.
ValueError: if the arguments are invalid.
ys = _AsList(ys)
xs = _AsList(xs)
if grad_ys is None:
grad_ys = [None] * len(ys)
grad_ys = _AsList(grad_ys)
with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
# The approach we take here is as follows: Create a list of all ops in the
# subgraph between the ys and xs. Visit these ops in reverse order of ids
# to ensure that when we visit an op the gradients w.r.t its outputs have
# been collected. Then aggregate these gradients if needed, call the op's
# gradient function, and add the generated gradients to the gradients for
# its input.
# Initialize the pending count for ops in the connected subgraph from ys
# to the xs.
to_ops = [t.op for t in ys]
from_ops = [t.op for t in xs]
pending_count, has_control_flow = _PendingCount(
ops.get_default_graph(), to_ops, from_ops)
# Iterate over the collected ops.
# grads: op => list of gradients received on each output endpoint of the
# op. The gradients for each endpoint are initially collected as a list.
# When it is time to call the op's gradient function, for each endpoint we
# aggregate the list of received gradients into a Add() Operation if there
# is more than one.
grads = {}
# Add the initial gradients for the ys.
for y, grad_y in zip(ys, grad_ys):
_SetGrad(grads, y, grad_y)
# Initialize queue with to_ops.
queue = collections.deque()
# Add the ops in 'to_ops' into the queue.
to_ops_set = set()
for op in to_ops:
if op._id not in to_ops_set:
# The set of 'from_ops'.
stop_ops = _StopOps(from_ops, pending_count)
while queue:
# generate gradient subgraph for op.
op = queue.popleft()
with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
if has_control_flow: