This article collects and summarizes typical usage examples of the pycuda.driver.from_device function in Python. If you have been struggling with questions such as: what exactly does from_device do? How is it called? What does real-world from_device code look like? — then the curated examples below should help.
The section below presents 20 code examples of the from_device function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
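Before the project excerpts, here is a minimal sketch of the round trip they all rely on (the array and variable names are illustrative only, not taken from any project below): pycuda.driver.to_device copies a NumPy array into freshly allocated device memory and returns a raw device allocation, while pycuda.driver.from_device copies it back into a new NumPy array; the shape and dtype must be passed explicitly because the raw device pointer carries no metadata.

import numpy as np
import pycuda.autoinit  # creates a default CUDA context
import pycuda.driver as cuda

a = np.arange(12, dtype=np.float32)            # host array
a_gpu = cuda.to_device(a)                      # allocate device memory and copy a into it
b = cuda.from_device(a_gpu, a.shape, a.dtype)  # copy back into a new NumPy array
assert np.allclose(a, b)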
Example 1: get_phir_gpu
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area)  # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i] = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i] = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i] = XK[i/K]
        X_Vc[i] = XV[i/K]

    toc = time.time()
    time_set = toc - tic

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
Developer: cdcooper84, Project: pygbe, Lines of code: 59, Source: projection.py
Example 2: send
def send(target, nx, ny, nz, fx_gpu, fy_gpu):
    if target < myrank:
        offset_fx = int(fx_gpu)
        offset_fy = int(fy_gpu)
    else:
        offset_fx = int(fx_gpu) + nx*ny*(nz-1)*nof
        offset_fy = int(fy_gpu) + nx*ny*(nz-1)*nof
    mpi.world.send(target, 0, cuda.from_device(offset_fx, (nx,ny), np.float32))
    mpi.world.send(target, 1, cuda.from_device(offset_fy, (nx,ny), np.float32))
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 10, Source: 035-block1d-texture-smem-if-over_maxgrid-mpi-cpu.py
Example 3: exchange
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    ctx1 = cuda.Device(dev1).make_context()
    a = cuda.from_device(int(a_gpu)+(nx-2)*ny*nof, (ny,), np.float32)
    ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    cuda.memcpy_htod(int(b_gpu), a)
    b = cuda.from_device(int(b_gpu)+ny*nof, (ny,), np.float32)
    ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    cuda.memcpy_htod_async(int(a_gpu)+(nx-1)*ny*nof, b)
    ctx1.pop()
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 13, Source: 03-3GPU.py
Example 4: train_gpu
def train_gpu(self, num_iter, model_file_path):
    if self.batch == 0:
        # Prepare to send the numpy array to gpu
        self.syn1_gpu = cuda.to_device(self.syn1)
        # Create word idx and related data-structure.
        self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
        word_rep_ptr = int(self.base_word_rep)
        self.word_reps = {}
        for w_idx, word in sorted(self.dictionary.items()):
            word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
            word_point = self.words_rep[word][1].astype(dtype=np.int32)
            self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
            word_rep_ptr += WordRep.memsize
        print "GPU transfers done."
    self.sent_reps_gpu = cuda.to_device(self.sent_reps)
    # Prepare sentences for GPU transfer.
    idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                     for sentence in self.sentences]
    # Prepare the kernel function
    kernel = self.kernel_str.get_function("train_sg")
    words = np.empty(self.num_sents, dtype=np.int32)
    # sent_reps = np.copy(self.sent_reps)
    for iter in range(num_iter):
        # Sample words for each sentence and transfer to GPU
        for s_idx in range(self.num_sents):
            words[s_idx] = random.choice(idx_sentences[s_idx])
        words_gpu = cuda.to_device(words)
        kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
               block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
        # autoinit.context.synchronize()
    self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
    pickle_dump(self.sent_reps, model_file_path)
Developer: ustbliubo2014, Project: DeepLearn, Lines of code: 35, Source: paragraph_vector.py
Example 5: send
def send(s, rank, tag_mark, direction):
    if direction == "f":
        offset_gpu = int(s.arr_gpu) + s.ny * nof
    elif direction == "b":
        offset_gpu = int(s.arr_gpu) + (s.nx - 2) * s.ny * nof
    print type(offset_gpu)
    comm.send(rank, tag_mark, cuda.from_device(offset_gpu, (s.ny,), s.dtype))
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 7, Source: 50-3GPU-mpi4py.py
Example 6: test_stub
def test_stub(shift, trials=10, rounds=1):
    # Run once so that evt_a doesn't include initialization time
    sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                     rounds, stream=stream)
    evt_a = cuda.Event().record(stream)
    for i in range(trials):
        buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                               rounds, stream=stream)
    evt_b = cuda.Event().record(stream)
    evt_b.synchronize()
    dur = evt_b.time_since(evt_a) / (rounds * trials)
    print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000),
                                     count * sorter.radix_bits / (dur * 32 * 1000))
    if shift == 0 and correctness:
        print '\nTesting correctness'
        out = cuda.from_device(buf, (count,), np.uint32)
        sort = np.sort(keys)
        if np.all(out == sort):
            print 'Correct'
        else:
            nz = np.nonzero(out != sort)[0]
            print sorted(set(nz >> 13))
            for i in nz:
                print i, out[i-1:i+2], sort[i-1:i+2]
            assert False, 'Oh no'
Developer: gijzelaerr, Project: cuburn, Lines of code: 26, Source: sort.py
Example 7: get_from_device
def get_from_device(self, index_list=None):
    '''
    Copy array data from the GPU device and wrap it in numpy arrays.

    If index_list is None, return a list of numpy arrays (one per array).
    If index_list is a single integer, return a single numpy array.
    If index_list is an iterable, return a list of numpy arrays
    (one per selected array).
    '''
    single = False
    if index_list is None:
        index_list = range(len(self.data))
    else:
        try:
            int(index_list)
            index_list = [index_list]
            single = True
        except TypeError:
            pass
    results = []
    try:
        for i in index_list:
            results.append(cuda.from_device(self.data[i], self.shapes[i],
                                            self.dtypes[i]))
    except cuda.LaunchError:
        import traceback
        traceback.print_exc()
        traceback.print_stack()
        raise ValueError, 'Invalid device pointer: %d' % i
    if single:
        return results[0]
    else:
        return results
Developer: cfobel, Project: pycuda_helpers, Lines of code: 33, Source: cuda.py
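A hypothetical usage sketch of the wrapper above (the instance name buffers is made up; it assumes the .data, .shapes and .dtypes lists have already been populated), illustrating the three index_list modes described in the docstring:

all_arrays = buffers.get_from_device()     # list of NumPy arrays, one per stored buffer
first = buffers.get_from_device(0)         # a single NumPy array
subset = buffers.get_from_device([0, 2])   # list of arrays for the selected buffers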
Example 8: evaluate
def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on the training set.
    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    @return: output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
        raise ValueError("Need %d Parameter structures (provided %d)" % (
            self.popSize, len(params)))
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))
    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)
    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)
    driver.Context.synchronize()
    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    if returnOutputs:
        return self.outputsMat
Developer: cpatulea, Project: evolution, Lines of code: 35, Source: ann.py
Example 9: print_arr_gpus
def print_arr_gpus(s):
    s.send_result()
    if mpi.rank == 0:
        result = cuda.from_device(s.arr_gpu, s.shape, s.dtype)
        for i in range(1, ngpu):
            result = np.concatenate((result, mpi.world.recv(i, 10)))
        for i in xrange(s.ny):
            print result[:s.nx,i], '\t', result[s.nx:2*s.nx,i], '\t', result[2*s.nx:,i]
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 8, Source: 20-3GPU-mpi-non_blocking.py
Example 10: print_arr_gpus
def print_arr_gpus(ngpu, nx, ny, a_gpu):
    send_result(nx, ny, a_gpu)
    if mpi.rank == 0:
        result = cuda.from_device(a_gpu, (nx,ny), 'float32')
        print ngpu
        for i in range(1, ngpu):
            result = np.concatenate((result, mpi.world.recv(i, 10)))
        for i in xrange(ny):
            print result[:nx,i], '\t', result[nx:2*nx,i], '\t', result[2*nx:,i]
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 9, Source: 010-mpi-exchange.py
Example 11: P2P_gpu
def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc, K_gpu, V_gpu,
            surf, LorY, K_diag, IorE, L, w, param, timing, kernel):

    tic = cuda.Event()
    toc = cuda.Event()

    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mxDev = cuda.to_device(mx.astype(REAL))
    myDev = cuda.to_device(my.astype(REAL))
    mzDev = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # Pointer to first element in lists arrays

    # Check if internal or external to send correct singular integral
    if IorE == 1:
        sglInt = surfSrc.sglInt_intDev
    else:
        sglInt = surfSrc.sglInt_extDev

    direct_gpu(K_gpu, V_gpu, surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
               surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mxDev, myDev, mzDev,
               mKcDev, mVcDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, sglInt,
               surfSrc.vertexDev, int32(ptr_offset), int32(ptr_list),
               int32(LorY), REAL(param.kappa), REAL(param.threshold),
               int32(param.BlocksPerTwig), int32(param.NCRIT), REAL(K_diag), AI_int,
               surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    tic.record()
    AI_aux = zeros(param.Nround, dtype=int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
Developer: LEONOB2014, Project: pygbe, Lines of code: 57, Source: FMMutils.py
Example 12: get_heartbeat
def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block

    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    indecies = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indecies)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    cd_index, c_length = compact_sparse_with_mask(d_index, masks, length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)
    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr,
                   cd_index,
                   numpy.int32(sampling_rate),
                   numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    clean_result(dev_rr, numpy.int32(120), numpy.int32(40),
                 numpy.int32(1), numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))
    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)
    index[0] = index[1]
    return rr, index / float(sampling_rate * 3600)
Developer: cconklin, Project: ECE207Proj2, Lines of code: 43, Source: plotecg.py
Example 13: lift
def lift(self, n):
    """Returns (positive rate within n largest) / (overall positive rate) for
    each individual.
    @return list of lifts, in order of individuals
    """
    self.countKernel.prepared_call(self.countGridDim,
                                   self.outputs,
                                   self.trainSet.size,
                                   len(self.trainSet.positives),
                                   self.popSize,
                                   self.thresholds,
                                   self.counts)
    driver.Context.synchronize()
    countsMat = driver.from_device(self.counts,
                                   shape=(self.popSize, self.countBlockDim[0]),
                                   dtype=np.uint32)
    #log.debug("counts %r: %s", countsMat.shape, str(countsMat))
    log.debug("count sum over threads: %s", str(countsMat.sum(axis=1)))
    self.countSums = countsMat.sum(axis=1)
    self.nlargestPositiveRate = np.float32(self.countSums) / n
    log.debug("positive rate (n largest outputs): %s", str(self.nlargestPositiveRate))
    overallPositiveRate = float(len(self.trainSet.positives)) / float(self.trainSet.size)
    log.debug("positive rate (overall): %.04f", overallPositiveRate)
    lifts = self.nlargestPositiveRate / overallPositiveRate
    sortedLifts = sorted(enumerate(lifts), key=lambda (i, l): l, reverse=True)
    topIndex, topLift = sortedLifts[0]
    topOutputs = self.outputsMat[topIndex]
    nans = np.sum(np.isnan(topOutputs))
    neginfs = np.sum(np.isneginf(topOutputs))
    posinfs = np.sum(np.isposinf(topOutputs))
    omin = np.nanmin(topOutputs)
    omax = np.nanmax(topOutputs)
    threshold = self.thresholdsMat[topIndex]
    """
    log.info("The top ANN's outputs are:")
    log.info(
        " %.02f%% NaN, %.02f%% -inf, %.02f%% +inf, min %.02e, max %.02e, thresh %.02e",
        100.0 * nans / len(topOutputs),
        100.0 * neginfs / len(topOutputs),
        100.0 * posinfs / len(topOutputs),
        omin, omax, threshold)
    """
    return lifts
Developer: cpatulea, Project: evolution, Lines of code: 55, Source: ann2layer.py
Example 14: nlargest_cpu
def nlargest_cpu(ann, n):
    """CPU implementation of nlargest."""
    outputs = driver.from_device(ann.outputs,
                                 shape=(ann.popSize, ann.trainSize),
                                 dtype=np.float32)
    thresholds = []
    for row in outputs:
        sortedRow = sorted(row, reverse=True)
        thresholds.append(sortedRow[n])
    return thresholds
Developer: cpatulea, Project: evolution, Lines of code: 12, Source: bench_nlargest.py
Example 15: P2PKt_gpu
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu,
              surf, LorY, w, param, timing, kernel):

    if param.GPU==1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT))  # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf])       # Pointer to first element in lists arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                 surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                 surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev,
                 surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev,
                 surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list),
                 numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                 numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int,
                 surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    tic.record()
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
Developer: cdcooper84, Project: pygbe, Lines of code: 52, Source: FMMutils.py
Example 16: go_sort_old
def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)
    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
    #       block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    #if not stream:
    #    split = cuda.from_device_like(dsplit, pfxs)
    #    split_ = cuda.from_device_like(dsplit_, pfxs)
    #    print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           dshortseg_pfxs, dshortseg_sums,
           block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)
    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
           block=(32, 32, 1), grid=(grids, 1), stream=stream)

    if not stream:
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp: fp.write(sort_stat)
        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp: fp.write(np_stat)
        print 'is_sorted?', np.all(sorted == sorted_np)
Developer: gijzelaerr, Project: cuburn, Lines of code: 47, Source: sortbench.py
Example 17: test_mwc
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[0].astype(np.uint64)
        states = seeds[1]
        carries = seeds[2]
        for i in range(rounds):
            step = np.frombuffer((mults * states + carries).data,
                                 dtype=np.uint32).reshape((2, nthreads), order='F')
            states[:] = step[0]
            carries[:] = step[1]
            sums += states
        ctime = time.time() - ctime
        print "Took %g seconds." % ctime

        print "Trial %d, on device: " % trial,
        dsums = cuda.mem_alloc(8*nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds, dsums, np.float32(rounds),
                    block=(blockwidth,1,1), grid=(nblocks,1),
                    time_kernel=True)
        print "Took %g seconds." % dtime
        dsums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, dsums)):
            print "Sum discrepancy!"
            print sums
            print dsums
Developer: vincentmele, Project: cuburn, Lines of code: 40, Source: mwc.py
Example 18: show
def show(s, a):
    print a
    print cuda.from_device(s.arr_gpu, (6,5), np.float32).T
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 3, Source: boostmpi-multiprocessing-02.py
Example 19: send
def send(s, rank, tag_mark):
    a = cuda.from_device(s.arr_gpu, (6,5), np.float32)
    mpi.world.send(rank, tag_mark, a)
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 3, Source: boostmpi-multiprocessing-02.py
Example 20: _print_interp_knots
def _print_interp_knots(self, rdr, tsidx=5):
    infos = cuda.from_device(self.info_a.d_params,
                             (tsidx + 1, len(rdr.packer)), f32)
    for i, n in zip(infos[-1], rdr.packer.packed):
        print '%60s %g' % ('_'.join(n), i)
Developer: stevenrobertson, Project: cuburn, Lines of code: 5, Source: render.py
Note: the pycuda.driver.from_device function examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The code snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution or use should follow the corresponding project's License. Do not reproduce without permission.