This article collects typical usage examples of the Python function torch.bmm. If you have been wondering what exactly torch.bmm does, how to use it, and what real usage looks like, the curated code examples below may help.
Twenty code examples of the bmm function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
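Before the examples, a minimal sketch of what torch.bmm computes: it performs a batched matrix multiplication between two 3-D tensors of shapes (b, n, m) and (b, m, p), producing a tensor of shape (b, n, p). The tensors below are illustrative only:

import torch

a = torch.randn(4, 3, 5)  # a batch of four 3x5 matrices
b = torch.randn(4, 5, 2)  # a batch of four 5x2 matrices
out = torch.bmm(a, b)     # multiplies matching pairs along the batch dim
print(out.shape)          # torch.Size([4, 3, 2])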
Example 1: forward
def forward(self, inputs):
    x, u = inputs
    x = self.bn0(x)
    x = F.tanh(self.linear1(x))
    x = F.tanh(self.linear2(x))

    V = self.V(x)
    mu = F.tanh(self.mu(x))

    Q = None
    if u is not None:
        num_outputs = mu.size(1)
        L = self.L(x).view(-1, num_outputs, num_outputs)
        L = L * self.tril_mask.expand_as(L) + torch.exp(L) * self.diag_mask.expand_as(L)
        P = torch.bmm(L, L.transpose(2, 1))

        u_mu = (u - mu).unsqueeze(2)
        A = -0.5 * torch.bmm(torch.bmm(u_mu.transpose(2, 1), P), u_mu)[:, :, 0]

        Q = A + V

    return mu, Q, V
Author: lenvdv, Project: pytorch-ddpg-naf, Lines: 25, Source: naf.py
Example 2: predict
def predict(self, x_de, x_en):
    bs = x_de.size(0)
    emb_de = self.embedding_de(x_de)  # bs, n_de, word_dim
    emb_en = self.embedding_en(x_en)  # bs, n_en, word_dim
    h_enc = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    c_enc = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    h_dec = Variable(torch.zeros(self.n_layers, bs, self.hidden_dim).cuda())
    c_dec = Variable(torch.zeros(self.n_layers, bs, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h_enc, c_enc))  # (bs, n_de, hiddensz*2)
    dec_h, _ = self.decoder(emb_en, (h_dec, c_dec))  # (bs, n_en, hiddensz)
    # enc_h is bs, n_de, hiddensz*n_directions; h and c are both n_layers*n_directions, bs, hiddensz
    if self.directions == 2:
        scores = torch.bmm(self.dim_reduce(enc_h), dec_h.transpose(1, 2))
    else:
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2))
    # (bs, n_de, hiddensz) * (bs, hiddensz, n_en) = (bs, n_de, n_en)
    scores[(x_de == pad_token).unsqueeze(2).expand(scores.size())] = -math.inf  # mask out source padding
    attn_dist = F.softmax(scores, dim=1)  # bs, n_de, n_en
    context = torch.bmm(attn_dist.transpose(2, 1), enc_h)
    # (bs, n_en, n_de) * (bs, n_de, hiddensz*n_directions) = (bs, n_en, hiddensz*n_directions)
    pred = self.vocab_layer(torch.cat([dec_h, context], 2))  # bs, n_en, len(EN.vocab)
    pred = pred[:, :-1, :]  # alignment
    _, tokens = pred.max(2)  # bs, n_en-1
    sauce = Variable(torch.cuda.LongTensor([[sos_token]] * bs))  # bs, 1
    return torch.cat([sauce, tokens], 1), attn_dist
Author: anihamde, Project: cs287-s18, Lines: 25, Source: visualization_junk_copy_to_aws.py
Example 3: forward
def forward(self, feat, right, wrong, batch_wrong, fake=None, fake_diff_mask=None):
    num_wrong = wrong.size(1)
    batch_size = feat.size(0)

    feat = feat.view(-1, self.ninp, 1)
    right_dis = torch.bmm(right.view(-1, 1, self.ninp), feat)
    wrong_dis = torch.bmm(wrong, feat)
    batch_wrong_dis = torch.bmm(batch_wrong, feat)

    wrong_score = torch.sum(torch.exp(wrong_dis - right_dis.expand_as(wrong_dis)), 1) \
        + torch.sum(torch.exp(batch_wrong_dis - right_dis.expand_as(batch_wrong_dis)), 1)

    loss_dis = torch.sum(torch.log(wrong_score + 1))
    loss_norm = right.norm() + feat.norm() + wrong.norm() + batch_wrong.norm()

    if fake is not None:  # truth-testing a tensor directly is ambiguous, so compare against None
        fake_dis = torch.bmm(fake.view(-1, 1, self.ninp), feat)
        fake_score = torch.masked_select(torch.exp(fake_dis - right_dis), fake_diff_mask)

        margin_score = F.relu(torch.log(fake_score + 1) - self.margin)
        loss_fake = torch.sum(margin_score)
        loss_dis += loss_fake
        loss_norm += fake.norm()

    loss = (loss_dis + 0.1 * loss_norm) / batch_size
    if fake is not None:
        return loss, loss_fake.data[0] / batch_size
    else:
        return loss
Author: AashishV, Project: visDial.pytorch, Lines: 30, Source: model.py
Example 4: forward
def forward(self, vocab):
    with torch.no_grad():
        batch_shape = vocab['sentence'].shape
        s_embedding = self.embedding(vocab['sentence'].cuda())
        a_embedding = self.embedding(vocab['aspect'].cuda())
        packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)
    out_s, (h_s, c1) = self.lstm_s(packed_s)  # packed output
    out_a, (h_a, c2) = self.lstm_a(a_embedding)
    with torch.no_grad():
        unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)
    # Pair-wise interaction matrix
    I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0, 2, 1))
    # Column-wise softmax
    a2s_attn = F.softmax(I_matrix, dim=1)
    # Row-wise softmax => column-wise average => aspect attention
    s2a_attn = F.softmax(I_matrix, dim=2)
    a_attn = torch.mean(s2a_attn, dim=1)
    # Final sentence attention => weighted sum of the individual a2s_attn columns
    s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))
    final_rep = torch.bmm(unpacked_out_s.permute(0, 2, 1), s_attn).squeeze(-1)
    pred = self.fc(final_rep)
    return pred
Author: bearcave9, Project: Weekend-Projects, Lines: 30, Source: AOA_LSTM.py
Example 5: forward
def forward(self, ht, hs, mask, weighted_ctx=True):
    '''
    ht: batch x ht_dim
    hs: (seq_len x batch x hs_dim, seq_len x batch x ht_dim)
    mask: seq_len x batch
    '''
    hs, hs_ = hs
    # seq_len, batch, _ = hs.size()
    hs = hs.transpose(0, 1)
    hs_ = hs_.transpose(0, 1)
    # hs: batch x seq_len x hs_dim
    # hs_: batch x seq_len x ht_dim
    # hs_ = self.hs2ht(hs)
    # Alignment/attention function
    # batch x ht_dim x 1
    ht = ht.unsqueeze(2)
    # batch x seq_len
    score = torch.bmm(hs_, ht).squeeze(2)
    # attn = F.softmax(score, dim=-1)
    attn = F.softmax(score, dim=-1) * mask.transpose(0, 1) + EPSILON
    attn = attn / attn.sum(-1, keepdim=True)
    # Compute the attention-weighted sum of hs.
    # batch x 1 x seq_len
    attn = attn.unsqueeze(1)
    if weighted_ctx:
        # batch x hs_dim
        weight_hs = torch.bmm(attn, hs).squeeze(1)
    else:
        weight_hs = None

    return weight_hs, attn
Author: UriSha, Project: sigmorphon, Lines: 32, Source: model.py
Example 6: forward_dot
def forward_dot(self, hid, ctx, ctx_mask):
    r"""Computes Luong-style dot attention probabilities between
    the decoder's hidden state and the source annotations.

    Arguments:
        hid(Variable): A set of decoder hidden states of shape `T*B*H`
            where `T` == 1, `B` is the batch dim and `H` is the hidden state dim.
        ctx(Variable): A set of annotations of shape `S*B*C` where `S`
            is the source timestep dim, `B` is the batch dim and `C`
            is the annotation dim.
        ctx_mask(FloatTensor): A binary mask of shape `S*B` with zeroes
            in the padded timesteps.

    Returns:
        scores(Variable): A variable of shape `S*B` containing normalized
            attention scores for each position and sample.
        z_t(Variable): A variable of shape `B*H` containing the final
            attended context vector for this target decoding timestep.
    """
    # Apply transformations first to make the last dims both C, then
    # shuffle dims to prepare for the batched mat-mul
    ctx_ = self.ctx2ctx(ctx).permute(1, 2, 0)  # S*B*C -> S*B*C -> B*C*S
    hid_ = self.hid2ctx(hid).permute(1, 0, 2)  # T*B*H -> T*B*C -> B*T*C

    # 'dot' scores of B*T*S
    scores = F.softmax(torch.bmm(hid_, ctx_), dim=-1)

    # Transform back to hidden_dim for further decoders
    # B*T*S x B*S*C -> B*T*C -> B*T*H
    z_t = self.ctx2hid(torch.bmm(scores, ctx.transpose(0, 1)))

    return scores.transpose(0, 1), z_t.transpose(0, 1)
Author: bardetadrien, Project: nmtpytorch, Lines: 32, Source: attention.py
Example 7: forward
def forward(self, output, context):
    batch_size = output.size(0)
    hidden_size = output.size(2)
    input_size = context.size(1)
    # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
    attn = torch.bmm(output, context.transpose(1, 2))
    mask = torch.eq(attn, 0).data.byte()
    attn.data.masked_fill_(mask, -float('inf'))
    attn = F.softmax(attn.view(-1, input_size), dim=1).view(batch_size, -1, input_size)

    # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
    mix = torch.bmm(attn, context)

    # concat -> (batch, out_len, 2*dim)
    combined = torch.cat((mix, output), dim=2)
    # output -> (batch, out_len, dim)
    output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)

    if not output.is_contiguous():
        output = output.contiguous()

    return output, attn
Author: shruthi0898, Project: Writing-editing-Network, Lines: 25, Source: attention.py
Example 8: backward
def backward(ctx, grad_output):
    batch1, batch2 = ctx.saved_variables
    grad_add_matrix = grad_batch1 = grad_batch2 = None

    if ctx.needs_input_grad[0]:
        grad_add_matrix = maybe_unexpand(grad_output, ctx.add_matrix_size)
        if ctx.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(ctx.alpha)

    if any(ctx.needs_input_grad[1:]):
        batch_grad_output = (grad_output
                             .unsqueeze(0)
                             .expand(batch1.size(0), batch1.size(1), batch2.size(2)))

    if ctx.needs_input_grad[1]:
        grad_batch1 = torch.bmm(batch_grad_output, batch2.transpose(1, 2))
        if ctx.beta != 1:
            grad_batch1 *= ctx.beta

    if ctx.needs_input_grad[2]:
        grad_batch2 = torch.bmm(batch1.transpose(1, 2), batch_grad_output)
        if ctx.beta != 1:
            grad_batch2 *= ctx.beta

    return grad_add_matrix, grad_batch1, grad_batch2, None, None, None
Author: Northrend, Project: pytorch, Lines: 25, Source: blas.py
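The two torch.bmm calls above implement the standard matrix-product gradients: for Y = B1·B2 with upstream gradient G, dL/dB1 = G·B2ᵀ and dL/dB2 = B1ᵀ·G. A small sketch (shapes illustrative) checking these formulas against autograd:

import torch

b1 = torch.randn(4, 3, 5, requires_grad=True)
b2 = torch.randn(4, 5, 2, requires_grad=True)
out = torch.bmm(b1, b2)
g = torch.randn_like(out)  # an arbitrary upstream gradient
out.backward(g)

# the same bmm-based gradient formulas used in the backward above
assert torch.allclose(b1.grad, torch.bmm(g, b2.transpose(1, 2)), atol=1e-5)
assert torch.allclose(b2.grad, torch.bmm(b1.transpose(1, 2), g), atol=1e-5)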
Example 9: forward
def forward(self, q, k, v):
    b_q, t_q, dim_q = list(q.size())
    b_k, t_k, dim_k = list(k.size())
    b_v, t_v, dim_v = list(v.size())
    assert b_q == b_k and b_k == b_v  # batch sizes should be equal
    assert dim_q == dim_k  # dims should be equal
    assert t_k == t_v  # times should be equal
    b = b_q
    qk = torch.bmm(q, k.transpose(1, 2))  # b x t_q x t_k
    qk.div_(dim_k ** 0.5)
    mask = None
    if self.causal and t_q > 1:
        causal_mask = q.data.new(t_q, t_k).byte().fill_(1).triu_(1)
        mask = Variable(causal_mask.unsqueeze(0).expand(b, t_q, t_k),
                        requires_grad=False)
    if self.mask_k is not None:
        mask_k = self.mask_k.unsqueeze(1).expand(b, t_q, t_k)
        mask = mask_k if mask is None else mask | mask_k
    if self.mask_q is not None:
        mask_q = self.mask_q.unsqueeze(2).expand(b, t_q, t_k)
        mask = mask_q if mask is None else mask | mask_q
    if mask is not None:
        qk.masked_fill_(mask, -1e9)

    sm_qk = F.softmax(qk, dim=2)
    sm_qk = self.dropout(sm_qk)
    return torch.bmm(sm_qk, v), sm_qk  # b x t_q x dim_v
Author: yangkexin, Project: seq2seq.pytorch, Lines: 27, Source: attention.py
Example 10: lstsq
def lstsq(b, y, alpha=0.01):
    """
    Batched linear least-squares for PyTorch, with optional L2 (ridge)
    regularization: `alpha` is added to the diagonal of the normal equations.

    Parameters
    ----------
    b : shape (L, M, N)
        Batch of design matrices.
    y : shape (L, M)
        Batch of target vectors.

    Returns
    -------
    tuple of (coefficients, fitted values, residuals)
    """
    bT = b.transpose(-1, -2)
    AA = torch.bmm(bT, b)
    if alpha != 0:
        diag = torch.diagonal(AA, dim1=1, dim2=2)
        diag += alpha  # in-place add regularizes AA
    RHS = torch.bmm(bT, y[:, :, None])
    X, LU = torch.gesv(RHS, AA)  # solves AA @ X = RHS
    fit = torch.bmm(b, X)[..., 0]
    res = y - fit
    return X[..., 0], fit, res
Author: Tillsten, Project: skultrafast, Lines: 25, Source: pytorch_fitter.py
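For reference, a minimal usage sketch with synthetic data (all names and shapes are illustrative; note that the function relies on torch.gesv, which later PyTorch releases removed in favor of torch.linalg.solve, with the argument order swapped):

import torch

L, M, N = 8, 20, 3        # 8 independent problems, 20 samples, 3 coefficients each
b = torch.randn(L, M, N)  # batched design matrices
true_x = torch.randn(L, N, 1)
y = torch.bmm(b, true_x)[..., 0] + 0.01 * torch.randn(L, M)  # noisy targets

coef, fit, res = lstsq(b, y, alpha=0.01)
print(coef.shape, fit.shape, res.shape)  # (8, 3), (8, 20), (8, 20)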
Example 11: bnorm
def bnorm(x, U):
    mx = torch.bmm(U, x)      # mean of x under the operator U
    subs = x - mx             # center
    subs2 = subs * subs
    vx = torch.bmm(U, subs2)  # corresponding variance
    out = subs / (vx.clamp(min=1e-10).sqrt() + 1e-5)  # normalize, guarding against division by zero
    return out
Author: ParsonsZeng, Project: DiCoNet, Lines: 7, Source: model.py
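To see what bnorm computes, a small sketch under the assumption (suggested by the mean/variance usage above) that U is a row-stochastic averaging matrix; each feature of x is then normalized to roughly zero mean and unit variance across positions:

import torch

bs, n, d = 2, 5, 3
x = torch.randn(bs, n, d)
U = torch.full((bs, n, n), 1.0 / n)  # every row averages over all n positions
out = bnorm(x, U)
print(out.mean(dim=1))                 # approximately zero per feature
print(out.std(dim=1, unbiased=False))  # approximately one per feature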
Example 12: forward
def forward(self, q, k, v, attn_mask=None):
    d_k, d_v = self.d_k, self.d_v
    n_head = self.n_head

    residual = q

    mb_size, len_q, q_hidden_size = q.size()
    mb_size, len_k, k_hidden_size = k.size()
    mb_size, len_v, v_hidden_size = v.size()

    # treat as a (n_head) size batch
    q_s = q.repeat(n_head, 1, 1).view(n_head, -1, q_hidden_size)  # n_head x (mb_size*len_q) x d_model
    k_s = k.repeat(n_head, 1, 1).view(n_head, -1, k_hidden_size)  # n_head x (mb_size*len_k) x d_model
    v_s = v.repeat(n_head, 1, 1).view(n_head, -1, v_hidden_size)  # n_head x (mb_size*len_v) x d_model

    # treat the result as a (n_head * mb_size) size batch
    q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k)  # (n_head*mb_size) x len_q x d_k
    k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k)  # (n_head*mb_size) x len_k x d_k
    v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v)  # (n_head*mb_size) x len_v x d_v

    # perform attention; result size = (n_head * mb_size) x len_q x d_v
    outputs, attns = self.attention.forward(q_s, k_s, v_s, attn_mask=attn_mask.repeat(n_head, 1, 1))

    # back to the original mb_size batch; result size = mb_size x len_q x (n_head*d_v)
    outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1)

    # project back to the residual size
    outputs = self.proj.forward(outputs)
    outputs = self.dropout(outputs)

    return self.layer_norm(outputs + residual), attns
Author: chickenbestlover, Project: DrQA-RN, Lines: 35, Source: SubLayers.py
Example 13: predict2
def predict2(self, x_de, beamsz, gen_len):
    emb_de = self.embedding_de(x_de)  # "batch size", n_de, word_dim -- but "batch size" is 1 here!
    h0 = Variable(torch.zeros(self.n_layers*self.directions, 1, self.hidden_dim).cuda())
    c0 = Variable(torch.zeros(self.n_layers*self.directions, 1, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h0, c0))
    # since the encoder batch size is 1, enc_h is 1, n_de, hiddensz*n_directions
    if self.directions == 2:
        enc_h = self.dim_reduce(enc_h)  # 1, n_de, hiddensz
    masterheap = CandList(self.n_layers, self.hidden_dim, enc_h.size(1), beamsz)
    # in the following loop, beamsz is 1 for the first iteration and the true beamsz (100) afterward
    for i in range(gen_len):
        prev = masterheap.get_prev()  # beamsz
        emb_t = self.embedding_en(prev)  # embed the last thing we generated. beamsz, word_dim
        enc_h_expand = enc_h.expand(prev.size(0), -1, -1)  # beamsz, n_de, hiddensz

        h, c = masterheap.get_hiddens()  # (n_layers, beamsz, hiddensz), (n_layers, beamsz, hiddensz)
        dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))  # dec_h is beamsz, 1, hiddensz (batch_first=True)
        scores = torch.bmm(enc_h_expand, dec_h.transpose(1, 2)).squeeze(2)
        # (beamsz, n_de, hiddensz) * (beamsz, hiddensz, 1) = (beamsz, n_de, 1). squeeze to beamsz, n_de
        attn_dist = F.softmax(scores, dim=1)
        if self.attn_type == "hard":
            _, argmax = attn_dist.max(1)  # for each beam, select the most likely German word to attend to
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1).cuda())
            context = torch.bmm(one_hot.unsqueeze(1), enc_h_expand).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h_expand).squeeze(1)
        # the difference between hard and soft is just whether we use a one-hot or a distribution
        # context is beamsz, hiddensz*n_directions
        pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))  # beamsz, len(EN.vocab)
        # TODO: set the columns corresponding to <pad>, <unk>, </s>, etc. to 0

        masterheap.update_beam(pred)
        masterheap.update_hiddens(h, c)
        masterheap.update_attentions(attn_dist)
        masterheap.firstloop = False

    return masterheap.probs, masterheap.wordlist, masterheap.attentions
Author: anihamde, Project: cs287-s18, Lines: 35, Source: models_original.py
Example 14: predict
def predict(self, x_de, x_en):
    bs = x_de.size(0)
    emb_de = self.embedding_de(x_de)  # bs, n_de, word_dim
    emb_en = self.embedding_en(x_en)  # bs, n_en, word_dim
    h = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    c = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h, c))
    dec_h, _ = self.decoder(emb_en, (h, c))
    # enc_h is bs, n_de, hiddensz*n_directions; h and c are both n_layers*n_directions, bs, hiddensz
    if self.directions == 2:
        enc_h = self.dim_reduce(enc_h)  # bs, n_de, hiddensz
    scores = torch.bmm(enc_h, dec_h.transpose(1, 2))
    # (bs, n_de, hiddensz) * (bs, hiddensz, n_en) = (bs, n_de, n_en)

    y = [Variable(torch.cuda.LongTensor([sos_token] * bs))]  # bs
    self.attn = []
    for t in range(x_en.size(1) - 1):  # iterate over English words, with teacher forcing
        attn_dist = F.softmax(scores[:, :, t], dim=1)  # bs, n_de
        self.attn.append(attn_dist.data)
        if self.attn_type == "hard":
            _, argmax = attn_dist.max(1)  # bs. for each batch, select the most likely German word to attend to
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1).cuda())
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
        # the difference between hard and soft is just whether we use a one-hot or a distribution
        # context is bs, hiddensz
        pred = self.vocab_layer(torch.cat([dec_h[:, t, :], context], 1))  # bs, len(EN.vocab)
        _, next_token = pred.max(1)  # bs
        y.append(next_token)

    self.attn = torch.stack(self.attn, 0).transpose(0, 1)  # bs, n_en, n_de (for visualization!)
    y = torch.stack(y, 0).transpose(0, 1)  # bs, n_en
    return y, self.attn
Author: anihamde, Project: cs287-s18, Lines: 32, Source: models_original.py
Example 15: predict
def predict(self, x, attn_type="hard"):
    # predict with greedy decoding
    emb = self.embedding(x)
    h = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    c = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    enc_h, _ = self.encoder(emb, (h, c))
    y = [Variable(torch.zeros(x.size(0)).long())]
    self.attn = []
    for t in range(x.size(1)):
        emb_t = self.embedding(y[-1])
        dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2)).squeeze(2)
        attn_dist = F.softmax(scores, dim=1)
        self.attn.append(attn_dist.data)
        if attn_type == "hard":
            _, argmax = attn_dist.max(1)
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1))
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
        pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))
        _, next_token = pred.max(1)
        y.append(next_token)
    self.attn = torch.stack(self.attn, 0).transpose(0, 1)
    return torch.stack(y, 0).transpose(0, 1)
Author: anihamde, Project: cs287-s18, Lines: 25, Source: section4-Copy1.py
Example 16: forward
def forward(self, agent_qs, states):
    """Forward pass for the mixer.

    Arguments:
        agent_qs: Tensor of shape [B, T, n_agents, n_actions]
        states: Tensor of shape [B, T, state_dim]
    """
    bs = agent_qs.size(0)
    states = states.reshape(-1, self.state_dim)
    agent_qs = agent_qs.view(-1, 1, self.n_agents)
    # First layer
    w1 = th.abs(self.hyper_w_1(states))
    b1 = self.hyper_b_1(states)
    w1 = w1.view(-1, self.n_agents, self.embed_dim)
    b1 = b1.view(-1, 1, self.embed_dim)
    hidden = F.elu(th.bmm(agent_qs, w1) + b1)
    # Second layer
    w_final = th.abs(self.hyper_w_final(states))
    w_final = w_final.view(-1, self.embed_dim, 1)
    # State-dependent bias
    v = self.V(states).view(-1, 1, 1)
    # Compute final output
    y = th.bmm(hidden, w_final) + v
    # Reshape and return
    q_tot = y.view(bs, -1, 1)
    return q_tot
Author: jamescasbon, Project: ray, Lines: 26, Source: mixers.py
Example 17: forward
def forward(self, x_de, x_en, update_baseline=True):
    bs = x_de.size(0)
    # x_de is bs, n_de. x_en is bs, n_en
    emb_de = self.embedding_de(x_de)  # bs, n_de, word_dim
    emb_en = self.embedding_en(x_en)  # bs, n_en, word_dim
    h0_enc = torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda()
    c0_enc = torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda()
    h0_dec = torch.zeros(self.n_layers, bs, self.hidden_dim).cuda()
    c0_dec = torch.zeros(self.n_layers, bs, self.hidden_dim).cuda()
    # hidden vars have dimension n_layers*n_directions, bs, hiddensz
    enc_h, _ = self.encoder(emb_de, (Variable(h0_enc), Variable(c0_enc)))
    # enc_h is bs, n_de, hiddensz*n_directions (batch_first=True)
    dec_h, _ = self.decoder(emb_en, (Variable(h0_dec), Variable(c0_dec)))
    # dec_h is bs, n_en, hidden_size*n_directions
    # we have the encoder/decoder hidden states, so we are ready to do attention;
    # first get all the scores, which is easy since we are using dot-product attention
    if self.directions == 2:
        scores = torch.bmm(self.dim_reduce(enc_h), dec_h.transpose(1, 2))
        # TODO: any easier ways to reduce dimension?
    else:
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2))
    # (bs, n_de, hiddensz*n_directions) * (bs, hiddensz*n_directions, n_en) = (bs, n_de, n_en)

    reinforce_loss = 0  # this variable is only used for hard attention
    loss = 0
    avg_reward = 0
    # iterate only to dec_h.size(1)-1, since there's a </s> at the end of each sentence
    for t in range(dec_h.size(1) - 1):  # iterate over English words, with teacher forcing
        attn_dist = F.softmax(scores[:, :, t], dim=1)  # bs, n_de. the alphas (attention scores for each German word)
        if self.attn_type == "hard":
            cat = torch.distributions.Categorical(attn_dist)
            attn_samples = cat.sample()  # bs. each element is a sample from the categorical distribution
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, attn_samples.data.unsqueeze(1), 1).cuda())  # bs, n_de
            # a batch of one-hot vectors
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
            # the one-hot vectors select the corresponding hidden vectors from enc_h
            # (bs,1,n_de) * (bs,n_de,hiddensz*n_directions) = (bs,1,hiddensz*n_directions); squeeze to bs, hiddensz*n_directions
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)  # same dimensions
            # (bs,1,n_de) * (bs,n_de,hiddensz*n_directions) = (bs,1,hiddensz*n_directions)
        # context is bs, hidden_size*n_directions
        # the rnn output and the context together make the decoder "hidden state": bs, 2*hidden_size*n_directions
        pred = self.vocab_layer(torch.cat([dec_h[:, t, :], context], 1))  # bs, len(EN.vocab)
        y = x_en[:, t+1]  # bs. these are our labels
        no_pad = (y != pad_token)  # exclude English padding tokens
        reward = torch.gather(pred, 1, y.unsqueeze(1))  # bs, 1
        # reward[i] = pred[i, y[i]]: the log prob of the correct word for each batch element, similar to a negative cross-entropy
        reward = reward.squeeze(1)[no_pad]  # less than bs
        if self.attn_type == "hard":
            reinforce_loss -= (cat.log_prob(attn_samples[no_pad]) * (reward - self.baseline).detach()).sum()
            # REINFORCE update rule, with a moving-average baseline
        loss -= reward.sum()  # minimizing loss is maximizing reward

    no_pad_total = (x_en[:, 1:] != pad_token).data.sum()  # count of non-padding target tokens
    loss /= no_pad_total
    reinforce_loss /= no_pad_total
    avg_reward = -loss.data[0]
    if update_baseline:  # update the baseline as a moving average
        self.baseline = Variable(0.95 * self.baseline.data + 0.05 * avg_reward)
    return loss, reinforce_loss, avg_reward
Author: anihamde, Project: cs287-s18, Lines: 58, Source: models_original.py
Example 18: forward
def forward(self, match_encoders):
    '''
    match_encoders: (pn_steps, batch, hidden_size*2)
    '''
    vh_matrix = self.vh_net(match_encoders)  # pn_steps, batch, hidden_size

    # predict the start position
    h0 = Variable(torch.zeros(match_encoders.size(1), self.hidden_size)).cuda()
    c0 = Variable(torch.zeros(match_encoders.size(1), self.hidden_size)).cuda()
    wha1 = self.wa_net(h0)  # batch, hidden_size
    wha1 = wha1.expand(match_encoders.size(0), wha1.size(0), wha1.size(1))  # pn_steps, batch, hidden_size
    f1 = self.tanh(vh_matrix + wha1)  # pn_steps, batch, hidden_size
    vf1 = self.v_net(f1.transpose(0, 1)).squeeze(-1)  # batch, pn_steps
    beta1 = self.softmax(vf1)  # batch, pn_steps
    softmax_beta1 = self.softmax(beta1).view(beta1.size(0), 1, beta1.size(1))  # batch, 1, pn_steps
    inp = torch.bmm(softmax_beta1, match_encoders.transpose(0, 1))  # batch, 1, hidden_size
    inp = inp.squeeze(1)  # batch, hidden_size
    h1, c1 = self.pointer_lstm(inp, (h0, c0))

    # predict the end position
    wha2 = self.wa_net(h1)  # batch, hidden_size
    wha2 = wha2.expand(match_encoders.size(0), wha2.size(0), wha2.size(1))  # pn_steps, batch, hidden_size
    f2 = self.tanh(vh_matrix + wha2)  # pn_steps, batch, hidden_size
    vf2 = self.v_net(f2.transpose(0, 1)).squeeze(-1)  # batch, pn_steps
    beta2 = self.softmax(vf2)  # batch, pn_steps
    softmax_beta2 = self.softmax(beta2).view(beta2.size(0), 1, beta2.size(1))  # batch, 1, pn_steps
    inp = torch.bmm(softmax_beta2, match_encoders.transpose(0, 1))  # batch, 1, hidden_size
    inp = inp.squeeze(1)  # batch, hidden_size
    h2, c2 = self.pointer_lstm(inp, (h1, c1))

    _, start = torch.max(beta1, 1)
    _, end = torch.max(beta2, 1)

    beta1 = beta1.view(1, beta1.size(0), beta1.size(1))
    beta2 = beta2.view(1, beta2.size(0), beta2.size(1))
    logits = torch.cat([beta1, beta2])

    start = start.view(1, start.size(0))
    end = end.view(1, end.size(0))
    prediction = torch.cat([start, end]).transpose(0, 1).cpu().data.numpy()

    return logits, prediction
Author: xuwenshen, Project: Reading_Comprehension, Lines: 56, Source: linear_match_lstm.py
Example 19: forward
def forward(self, query_embeddings, in_memory_embeddings, out_memory_embeddings, attention_mask=None):
    attention = torch.bmm(in_memory_embeddings, query_embeddings.unsqueeze(2)).squeeze(2)
    if attention_mask is not None:
        # exclude masked elements from the softmax
        attention = attention_mask.float() * attention + (1 - attention_mask.float()) * -1e20
    probs = softmax(attention).unsqueeze(1)
    memory_output = torch.bmm(probs, out_memory_embeddings).squeeze(1)
    query_embeddings = self.linear(query_embeddings)
    output = memory_output + query_embeddings
    return output
Author: jojonki, Project: ParlAI, Lines: 10, Source: modules.py
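The masking line above uses a common trick: adding a large negative constant to the masked logits so that they receive essentially zero probability after the softmax. In isolation (values illustrative):

import torch
import torch.nn.functional as F

logits = torch.tensor([1.0, 2.0, 3.0, 4.0])
mask = torch.tensor([1.0, 1.0, 0.0, 1.0])    # 0 marks elements to exclude
masked = mask * logits + (1 - mask) * -1e20  # masked logits become huge negatives
print(F.softmax(masked, dim=0))              # third element gets ~0 probability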
Example 20: updateGradInput
def updateGradInput(self, input, gradOutput):
    M, v = input
    self.gradInput[0].resize_as_(M)
    self.gradInput[1].resize_as_(v)
    gradOutput = gradOutput.contiguous()

    assert gradOutput.ndimension() == 1 or gradOutput.ndimension() == 2

    if gradOutput.ndimension() == 2:
        assert M.ndimension() == 3
        assert v.ndimension() == 2
        bdim = M.size(0)
        odim = M.size(1)
        idim = M.size(2)

        if self.trans:
            torch.bmm(v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim), out=self.gradInput[0])
            torch.bmm(M, gradOutput.view(bdim, idim, 1), out=self.gradInput[1].view(bdim, odim, 1))
        else:
            torch.bmm(gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim), out=self.gradInput[0])
            torch.bmm(M.transpose(1, 2), gradOutput.view(bdim, odim, 1), out=self.gradInput[1].view(bdim, idim, 1))
    else:
        assert M.ndimension() == 2
        assert v.ndimension() == 1

        if self.trans:
            torch.ger(v, gradOutput, out=self.gradInput[0])
            self.gradInput[1] = M * gradOutput
        else:
            torch.ger(gradOutput, v, out=self.gradInput[0])
            self.gradInput[1] = M.t() * gradOutput

    return self.gradInput
Author: Jsmilemsj, Project: pytorch, Lines: 33, Source: MV.py
Note: the torch.bmm examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use must follow each project's License. Do not reproduce without permission.