This article collects typical usage examples of the pycuda.driver.memcpy_dtoh_async function in Python. If you have been wondering what exactly memcpy_dtoh_async does, how to call it, and what real-world usage looks like, the curated code samples below should help.
The following shows 20 code examples of memcpy_dtoh_async, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
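Before the collected examples, here is a minimal, self-contained sketch of the typical call pattern: an asynchronous device-to-host copy takes a page-locked (pinned) host buffer and a CUDA stream, and the host data is only valid after the stream has been synchronized. The array size and variable names below are purely illustrative.

import numpy as np
import pycuda.autoinit                     # create a context on the default device
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

# Some data living on the device.
d_arr = gpuarray.to_gpu(np.arange(1024, dtype=np.float32))

# Asynchronous copies need a page-locked (pinned) host buffer and a stream.
stream = drv.Stream()
h_arr = drv.pagelocked_empty(d_arr.shape, d_arr.dtype)

# Queue the device-to-host copy on the stream; the call returns immediately.
drv.memcpy_dtoh_async(h_arr, d_arr.gpudata, stream)

# ... other host-side work can overlap with the copy here ...

# Block until everything queued on the stream (including the copy) is done.
stream.synchronize()
assert np.array_equal(h_arr, np.arange(1024, dtype=np.float32))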
Example 1: data_finder
def data_finder(u, ss, sp, gpu_direct=True):
    data_package = data_list[u][ss][sp]
    dp = data_package.copy()

    memory_type = dp.memory_type
    if memory_type == 'devptr':
        if gpu_direct:
            devptr = data_list[u][ss][sp].devptr
            return devptr, dp
        else:
            devptr = data_list[u][ss][sp].devptr
            shape = dp.data_memory_shape
            bcmd = dp.data_contents_memory_dtype
            if log_type in ['time','all']: st = time.time()

            buf = numpy.empty((shape), dtype=bcmd)
            cuda.memcpy_dtoh_async(buf, devptr, stream=stream[1])
            # buf = cuda.from_device(devptr, shape, bcmd)

            if log_type in ['time','all']:
                u = dp.unique_id
                bytes = dp.data_bytes
                t = MPI.Wtime()-st
                ms = 1000*t
                bw = bytes/GIGA/t
                log("rank%d, \"%s\", u=%d, GPU%d data transfer from GPU memory to CPU memory, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, device_number, bytes/MEGA, ms, bw),'time', log_type)

            dp.memory_type = 'memory'
            dp.data_dtype = type(buf)
            return buf, dp
    else:
        data = data_list[u][ss][sp].data
        return data, dp

    return None, None
Developer: Anukura, Project: Vivaldi, Lines: 33, Source file: GPU_unit.py
Example 2: synchronize_start
def synchronize_start(self):
    """ Start the synchronization process. """

    # Use shorter, easier names for class variables.
    bufs = self._sync_buffers
    ptrs = self._sync_ptrs
    streams = self._sync_streams
    adj = self._sync_adj

    # Start the transfer operations needed.
    self._sync_tags = [mpi_tag() for k in range(2)]  # Mpi message tags.

    # Forward send.
    drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0])
    # Backward send.
    drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1])

    # Forward receive.
    self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \
                                     tag=self._sync_tags[0])
    # Backward receive.
    self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \
                                     tag=self._sync_tags[1])

    # Signalling variables needed to complete transfers.
    self._sync_part2_start = [False, False, False, False]
Developer: JesseLu, Project: maxwell-solver, Lines: 28, Source file: grid.py
Example 3: send
def send(data, data_package, dest=None, gpu_direct=True):
    global s_requests
    tag = 52
    dp = data_package
    # send data_package
    send_data_package(dp, dest=dest, tag=tag)

    bytes = dp.data_bytes
    memory_type = dp.memory_type

    if log_type in ['time','all']: st = time.time()

    flag = False
    request = None
    if memory_type == 'devptr':  # data in the GPU
        if gpu_direct:  # want to use GPU direct
            devptr = data
            buf = MPI.make_buffer(devptr.__int__(), bytes)
            ctx.synchronize()
            request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
            if VIVALDI_BLOCKING: MPI.Request.Wait(request)
            s_requests.append((request, buf, devptr))
            flag = True
        else:  # not want to use GPU direct
            # copy to CPU
            shape = dp.data_memory_shape
            dtype = dp.data_contents_memory_dtype
            buf = numpy.empty(shape, dtype=dtype)
            cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1])

            request = comm.Isend(buf, dest=dest, tag=57)
            if VIVALDI_BLOCKING: MPI.Request.Wait(request)
            s_requests.append((request, buf, None))
    else:  # data in the CPU
        # want to use GPU direct, not exist case
        # not want to use GPU direct
        if dp.data_dtype == numpy.ndarray:
            request = comm.Isend(data, dest=dest, tag=57)
            if VIVALDI_BLOCKING: MPI.Request.Wait(request)
            s_requests.append((request, data, None))

    if log_type in ['time','all']:
        u = dp.unique_id
        bytes = dp.data_bytes
        t = MPI.Wtime()-st
        ms = 1000*t
        bw = bytes/GIGA/t

        if flag:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)
        else:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type)

    return request
Developer: Anukura, Project: Vivaldi, Lines: 56, Source file: GPU_unit.py
Example 4: run
def run(self, scomp, scopy):
    # Pack
    kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
                             v.nvcol, v.basedata, v.mapping,
                             v.cstrides or 0, v.rstrides or 0, m)

    # Copy the packed buffer to the host
    event.record(scomp)
    scopy.wait_for_event(event)
    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Developer: Aerojspark, Project: PyFR, Lines: 10, Source file: packing.py
Example 5: copy
def copy(self, fb, dim, pool, stream=None):
    fmt = 'u1'
    if self.pix_fmt in ('yuv444p10', 'yuv420p10', 'yuv444p12'):
        fmt = 'u2'
    dims = (3, dim.h, dim.w)
    if self.pix_fmt == 'yuv420p10':
        dims = (dim.h * dim.w * 6 / 4,)
    h_out = pool.allocate(dims, fmt)
    cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
    return h_out
Developer: stevenrobertson, Project: cuburn, Lines: 10, Source file: output.py
Example 6: get_async
def get_async(self, stream=None, ary=None):
    if ary is None:
        ary = drv.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype

    if self.size:
        drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
Developer: minrk, Project: PyCUDA, Lines: 10, Source file: gpuarray.py
Example 7: get
def get(self, stream=None):
    """
    copy device array to host.

    Returns:
        the host numpy array
    """
    assert self.is_contiguous, "Array in get() must be contiguous"
    ary = np.empty(self.shape, self.dtype)
    drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
Developer: davidoj, Project: nervanagpu, Lines: 10, Source file: nervanagpu.py
Example 8: cpy_back
def cpy_back(a, a_gpu, auto_init_context=True):
    """Data transfer from device to host.

    Asynchronous will be enabled when auto_init_context is True, otherwise
    use normal transfer.
    """
    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_dtoh_async(a, a_gpu, strm)
        return strm
    else:
        drv.memcpy_dtoh(a, a_gpu)
Developer: budiaji, Project: anuga-cuda, Lines: 15, Source file: utility.py
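A hypothetical caller of the cpy_back helper above might look like the sketch below. The buffer names are made up, pycuda.autoinit stands in for whatever context setup the surrounding project uses, and the host array is allocated page-locked because a device-to-host copy only overlaps safely with other work when the destination is pinned memory. The caller is responsible for synchronizing the returned stream before reading the result.

import numpy as np
import pycuda.autoinit            # assumption: stands in for the project's own context setup
import pycuda.driver as drv

# Hypothetical buffers: a pinned host array and a device allocation of the same size.
a = drv.pagelocked_empty((1024,), np.float32)
a_gpu = drv.mem_alloc(a.nbytes)

strm = cpy_back(a, a_gpu, auto_init_context=True)
if strm is not None:
    strm.synchronize()            # wait until the asynchronous copy has landed
# a now holds the contents of a_gpu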
Example 9: get_async
def get_async(self, stream=None, ary=None):
    if ary is None:
        ary = drv.pagelocked_empty(self.shape, self.dtype)
        ary = _as_strided(ary, strides=self.strides)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert ary.flags.forc

    assert self.flags.forc, "Array in get() must be contiguous"

    if self.size:
        drv.memcpy_dtoh_async(ary, self.gpudata, stream)
    return ary
Developer: hannes-brt, Project: pycuda, Lines: 15, Source file: gpuarray.py
Example 10: get_async
def get_async(self, stream = None, ary = None):
    if ary is None:
        ary = cuda.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == ary.dtype
        if ary.base.__class__ != cuda.HostAllocation:
            raise TypeError("asynchronous memory trasfer requires pagelocked numpy array")

    if self.size:
        if self.M == 1:
            cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
        else:
            PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream)

    return ary
Developer: bionet, Project: vtem, Lines: 17, Source file: parray.py
Example 11: get_host_result
def get_host_result(self):
    if not self.gpu_finished:
        if self.gpu_finished_evt.query():
            self.gpu_finished = True
            self.copy_stream = get_stream()
            self.host_dest = self.pagelocked_allocator(
                    self.gpu_result.shape, self.gpu_result.dtype,
                    self.copy_stream)
            drv.memcpy_dtoh_async(self.host_dest,
                    self.gpu_result.gpudata,
                    self.copy_stream)
            self.copy_finished_evt = drv.Event()
            self.copy_finished_evt.record()
    else:
        if self.copy_finished_evt.query():
            STREAM_POOL.append(self.copy_stream)
            return self.host_dest
Developer: FreddieWitherden, Project: pycuda, Lines: 17, Source file: inner.py
Example 12: run
def run(self, scomp, scopy):
    # If we are unpacking then copy the host buffer to the GPU
    if op == 'unpack':
        cuda.memcpy_htod_async(m.data, m.hdata, scopy)
        event.record(scopy)
        scomp.wait_for_event(event)

    # Call the CUDA kernel (pack or unpack)
    fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                           v.mapping, v.strides, m,
                           v.mapping.leaddim, v.strides.leaddim,
                           m.leaddim)

    # If we have been packing then copy the GPU buffer to the host
    if op == 'pack':
        event.record(scomp)
        scopy.wait_for_event(event)
        cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Developer: bartwozniak, Project: PyFR, Lines: 18, Source file: packing.py
Example 13: test_streamed_kernel
def test_streamed_kernel(self):
    # this differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.

    mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32,8)
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    a[:] = numpy.random.randn(*shape)
    b[:] = numpy.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    strm = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, strm)
    drv.memcpy_htod_async(b_gpu, b, strm)
    strm.synchronize()

    dest = drv.pagelocked_empty_like(a)
    multiply_them(
            drv.Out(dest), a_gpu, b_gpu,
            block=shape+(1,), stream=strm)
    strm.synchronize()

    drv.memcpy_dtoh_async(a, a_gpu, strm)
    drv.memcpy_dtoh_async(b, b_gpu, strm)
    strm.synchronize()

    assert la.norm(dest-a*b) == 0
Developer: minrk, Project: PyCUDA, Lines: 41, Source file: test_driver.py
Example 14: threshold_integrated
def threshold_integrated(series, value):
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val

    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series))/nt/gs))

    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)

    if _n is None:
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())

        class T():
            pass

        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags

    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)

    #Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))

    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))

    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()
    if _n != 0:
        driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()

    return _loc[0:_n], _val[0:_n]
Developer: AbhayMK, Project: pycbc, Lines: 38, Source file: threshold_cuda.py
Example 15: len
    if len(shape) <= 1:
        if isinstance(src, GPUArray):
            if isinstance(dst, GPUArray):
                if async:
                    drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
                else:
                    drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
            else:
                # The arrays might be contiguous in the sense of
                # having no gaps, but the axes could be transposed
                # so that the order is neither Fortran or C.
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
                drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
            else:
                drv.memcpy_htod(dst.gpudata, src)
        return

    if len(shape) == 2:
        copy = drv.Memcpy2D()
    elif len(shape) == 3:
        copy = drv.Memcpy3D()
    else:
Developer: Benli11, Project: pycuda, Lines: 30, Source file: gpuarray.py
Example 16: from_buf_async
def from_buf_async(self, cl_buf, stream=None):
    cuda.memcpy_dtoh_async(self.buffers[cl_buf], cl_buf, stream)
Developer: mjanusz, Project: sailfish, Lines: 2, Source file: backend_cuda.py
Example 17: copy
def copy(self, fb, dim, pool, stream=None):
    h_out = pool.allocate((dim.h, dim.w, 4), "u1")
    cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
    return h_out
Developer: vincentmele, Project: cuburn, Lines: 4, Source file: output.py
Example 18: SourceModule
Stream2 = drv.Stream()
MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4)
MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes)

prg = SourceModule(
    transform_to_cuda(
        gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)
    )
)

prog = prg.get_function('mt_brute')

zzz = time.time()
ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream)
drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2)

for i in xrange(TEST_ITERATIONS):
    prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream)
    drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)  #, stream=Stream2)

zzz = time.time() - zzz
print '>>>', zzz

for row in MT_state_result[0]:
    print row

MT_state_buf.free()
MT_state_res_buf.free()
Developer: Gifts, Project: pyphp_rand_ocl, Lines: 31, Source file: cuda_test.py
Example 19: xrange
flop = 3*(nx*ny*nz*30)*tgap
flops = np.zeros(tmax/tgap+1)
start, stop = cuda.Event(), cuda.Event()
start.record()

# main loop
ey_tmp = cuda.pagelocked_zeros((ny,nz),'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
stream1 = cuda.Stream()

for tn in xrange(1, tmax+1):
    update_h.prepared_async_call(bpg0, stream1, np.int32(By), *eh_args)
    for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)

    if rank == 0:
        cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1)
        cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1)
        stream1.synchronize()
        comm.Send(hy_tmp, 1, 20)
        comm.Send(hz_tmp, 1, 21)
    elif rank == 1:
        comm.Recv(hy_tmp, 0, 20)
        comm.Recv(hz_tmp, 0, 21)
        cuda.memcpy_htod_async(int(hy_gpu), hy_tmp, stream1)
        cuda.memcpy_htod_async(int(hz_gpu), hz_tmp, stream1)

        cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1)
        cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1)
        stream1.synchronize()
        comm.Send(hy_tmp, 2, 20)
        comm.Send(hz_tmp, 2, 21)
    elif rank == 2:
Developer: wbkifun, Project: fdtd_accelerate, Lines: 31, Source file: 134-3GPU-pinned-async.py
Example 20: convert_image_rgb
def convert_image_rgb(self, image):
    global program
    start = time.time()
    iplanes = image.get_planes()
    w = image.get_width()
    h = image.get_height()
    stride = image.get_rowstride()
    pixels = image.get_pixels()
    debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
    assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
    assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    divs = get_subsampling_divs(self.dst_format)

    #copy packed rgb pixels to GPU:
    upload_start = time.time()
    stream = driver.Stream()
    mem = numpy.frombuffer(pixels, dtype=numpy.byte)
    in_buf = driver.mem_alloc(len(pixels))
    hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

    out_bufs = []
    out_strides = []
    out_sizes = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_stride = roundup(self.dst_width/x_div, 4)
        out_height = roundup(self.dst_height/y_div, 2)
        out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
        out_bufs.append(out_buf)
        out_strides.append(out_stride)
        out_sizes.append((out_stride, out_height))
    #ensure uploading has finished:
    stream.synchronize()
    #we can now unpin the host memory:
    hmem.base.unregister()
    debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

    kstart = time.time()
    kargs = [in_buf, numpy.int32(stride)]
    for i in range(3):
        kargs.append(out_bufs[i])
        kargs.append(numpy.int32(out_strides[i]))
    blockw, blockh = 16, 16
    #figure out how many pixels we process at a time in each dimension:
    xdiv = max([x[0] for x in divs])
    ydiv = max([x[1] for x in divs])
    gridw = max(1, w/blockw/xdiv)
    if gridw*2*blockw<w:
        gridw += 1
    gridh = max(1, h/blockh/ydiv)
    if gridh*2*blockh<h:
        gridh += 1
    debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
    self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh))

    #we can now free the GPU source buffer:
    in_buf.free()
    kend = time.time()
    debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
    self.frames += 1

    #copy output YUV channel data to host memory:
    read_start = time.time()
    pixels = []
    strides = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_size = out_sizes[i]
        #direct full plane async copy keeping current GPU padding:
        plane = driver.aligned_empty(out_size, dtype=numpy.byte)
        driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
        pixels.append(plane.data)
        stride = out_strides[min(len(out_strides)-1, i)]
        strides.append(stride)
    stream.synchronize()
    #the copying has finished, we can now free the YUV GPU memory:
    #(the host memory will be freed by GC when 'pixels' goes out of scope)
    for out_buf in out_bufs:
        out_buf.free()
    self.cuda_context.synchronize()
    read_end = time.time()
    debug("strides=%s", strides)
    debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
Developer: svn2github, Project: Xpra, Lines: 85, Source file: colorspace_converter.py
Note: the pycuda.driver.memcpy_dtoh_async examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by many developers, and the copyright of the code remains with the original authors. Please consult each project's license before redistributing or reusing the code; do not reproduce this compilation without permission.