Python driver.memcpy_htod_async Function Code Examples


This article collects typical usage examples of the pycuda.driver.memcpy_htod_async function in Python. If you have been wondering what memcpy_htod_async does, how to call it, or what real-world usage looks like, the curated examples below should help.



A total of 20 code examples of memcpy_htod_async are shown below, sorted by popularity by default.
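Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share. It was written for this page and is not taken from any of the projects below; the names host, dev and stream are illustrative. The pattern: allocate page-locked host memory, create a Stream, queue the copy with memcpy_htod_async, and synchronize once the data is actually needed.

import numpy as np
import pycuda.autoinit          # creates a default CUDA context
import pycuda.driver as drv

stream = drv.Stream()

# Page-locked (pinned) host memory lets the copy run truly asynchronously;
# with an ordinary numpy array the transfer degrades to a synchronous copy.
host = drv.pagelocked_empty((1024,), dtype=np.float32)
host[:] = np.random.randn(1024).astype(np.float32)

dev = drv.mem_alloc(host.nbytes)           # raw device allocation
drv.memcpy_htod_async(dev, host, stream)   # queue the host-to-device copy

# ... kernels launched on the same stream would see the copied data ...

stream.synchronize()                       # wait for all work queued on the stream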

Example 1: set

    def set(self, ary, device=None):
        """
        copy host array to device.
        Arguments:
            ary: host array, needs to be contiguous
            device: device id, if not the one attached to current context
        Returns:
            self
        """
        stream = self.backend.stream
        assert ary.size == self.size
        assert self.is_contiguous, "Array in set() must be contiguous"
        if ary.dtype is not self.dtype:
            ary = ary.astype(self.dtype)
        assert ary.strides == self.strides

        if device is None:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
        else:
            # with multithreaded datasets, make a context before copying
            # and destroy it again once done.
            lctx = drv.Device(device).make_context()
            drv.memcpy_htod_async(self.gpudata, ary, stream)
            lctx.pop()
            del lctx

        return self
Developer: leonardt, Project: nervanagpu, Lines: 27, Source: nervanagpu.py


Example 2: set_refsmiles

    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffers for query set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
Developer: ihaque, Project: SIML, Lines: 56, Source: GPULingo.py


Example 3: set_async

    def set_async(self, ary, stream=None):
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        assert self.flags.forc

        if not ary.flags.forc:
            raise RuntimeError("cannot asynchronously set from " "non-contiguous array")

        if self.size:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
Developer: abergeron, Project: pycuda, Lines: 10, Source: gpuarray.py


Example 4: set

 def set(self, tensor, data):
     assert isinstance(tensor, MGPUTensor)
     if tensor.ptype == 'replica':
         for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
             ctx.push()
             drv.memcpy_htod_async(dest.ptr, data, strm)
             ctx.pop()
         # tensor.copy_from(data)
     else:
         self.scatter(data, tensor)
Developer: neuroidss, Project: neon, Lines: 10, Source: mgpu.py


Example 5: exchange

def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
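	# nof is assumed to be the per-element byte size (4 for np.float32), and np
	# to be numpy; both are defined elsewhere in the original source file.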
	ctx1 = cuda.Device(dev1).make_context()
	a = cuda.from_device(int(a_gpu)+(nx-2)*ny*nof, (ny,), np.float32)
	ctx1.pop()

	ctx2 = cuda.Device(dev2).make_context()
	cuda.memcpy_htod(int(b_gpu), a)
	b = cuda.from_device(int(b_gpu)+ny*nof, (ny,), np.float32)
	ctx2.pop()

	ctx1 = cuda.Device(dev1).make_context()
	cuda.memcpy_htod_async(int(a_gpu)+(nx-1)*ny*nof, b)
	ctx1.pop()
Developer: wbkifun, Project: fdtd_accelerate, Lines: 13, Source: 03-3GPU.py


Example 6: kernel_write

def kernel_write(function_name, dest_devptr, dest_info, source_devptr, source_info, work_range, stream=None):
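	# mmtx, inv_mmtx, mod, Debug, log_type, rank, device_number, GIGA and MEGA
	# appear to be module-level globals defined elsewhere in the original GPU_unit.py.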
	global KD

	# initialize variables
	global tb_cnt
	tb_cnt = 0

	# dest
	cuda_args = [dest_devptr]
	cuda_args += [dest_info]

	# source
	cuda_args += [source_devptr]
	cuda_args += [source_info]

	# work_range
	cuda_args += make_cuda_list(work_range)

	# initialize model view
	eye = numpy.eye(4,dtype=numpy.float32)
	cuda.memcpy_htod_async(mmtx, eye, stream=stream)
	cuda.memcpy_htod_async(inv_mmtx, eye, stream=stream)
	
	try:
		if Debug:
			print "Function name: ", function_name
		func = mod.get_function(function_name) #cutting function
	except:
		print "Function not found ERROR"
		print "Function name: ", function_name
		assert(False)
		
	# set work range
	block, grid = range_to_block_grid(work_range)

	if log_type in ['time', 'all']:
		st = time.time()

	func(*cuda_args, block=block, grid=grid, stream=stream)

	#ctx.synchronize()
	
	KD.append((dest_info, source_info))
	

	if log_type in ['time', 'all']:
		bytes = make_bytes(work_range,3)
		t = MPI.Wtime()-st
		ms = 1000*t
		bw = bytes/GIGA/t
		log("rank%d, GPU%d, , kernel write time, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec "%(rank, device_number, bytes/MEGA, ms, bw),'time', log_type)
Developer: davidhildebrand, Project: Vivaldi_public, Lines: 51, Source: GPU_unit.py


Example 7: set_async

 def set_async(self, ary, stream=None):
     assert ary.ndim <= 3
     assert ary.dtype == self.dtype

     assert ary.size == self.size

     if ary.base.__class__ != cuda.HostAllocation:
         raise TypeError("asynchronous memory transfer requires pagelocked numpy array")

     if self.size:
         if self.M == 1:
             cuda.memcpy_htod_async(self.gpudata, ary, stream)
         else:
             PitchTrans(self.shape, self.gpudata, self.ld, ary, _pd(self.shape), self.dtype, async = True, stream = stream)
Developer: bionet, Project: vtem, Lines: 14, Source: parray.py


Example 8: synchronize_isdone

    def synchronize_isdone(self):
        """ Complete synchronization process. """

        # Use shorter, easier names for class variables.
        bufs = self._sync_buffers
        ptrs = self._sync_ptrs
        streams = self._sync_streams
        adj = self._sync_adj
        part2_start = self._sync_part2_start 
        is_done = [False, False, False, False]

        # Forward send.
        if streams[0].is_done(): # Device-to-host copy completed.
            if not part2_start[0]: # Initialize MPI send.
                comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
                part2_start[0] = True
                is_done[0] = True
            else: # No more work to do.
                is_done[0] = True

        # Backward send.
        if streams[1].is_done(): # Device-to-host copy completed.
            if not part2_start[1]: # Initialize MPI send.
                comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1])
                part2_start[1] = True
                is_done[1] = True
            else: # No more work to do.
                is_done[1] = True

        # Forward receive.
        if self._sync_req_forw.Test(): # MPI receive completed.
            if not part2_start[2]: # Initialize host-to-device copy.
                drv.memcpy_htod_async(ptrs['back_dest'], bufs[2], \
                                        stream=streams[2]) # Host-to-device.
                part2_start[2] = True
            elif streams[2].is_done(): # Host-to-device copy completed.
                is_done[2] = True

        # Backward receive.
        if self._sync_req_back.Test(): # MPI receive completed.
            if not part2_start[3]: # Initialize host-to-device copy.
                drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3], \
                                        stream=streams[3]) # Host-to-device.
                part2_start[3] = True
            elif streams[3].is_done(): # Host-to-device copy completed.
                is_done[3] = True
        # print '~', is_done[0:4],
        # Return true only when all four transfers are complete.
        return all(is_done) 
Developer: JesseLu, Project: maxwell-solver, Lines: 49, Source: grid.py


Example 9: load_data_on_gpu

    def load_data_on_gpu(tl_args, module):
        d_V = module.get_global('d_V')[0]
        cuda.memcpy_htod_async(d_V, tl_args.V)

        d_c = module.get_global('d_c')[0]
        cuda.memcpy_htod_async(d_c, tl_args.c)

        d_I = module.get_global('d_I')[0]
        cuda.memcpy_htod_async(d_I, tl_args.I)

        d_E = module.get_global('d_E')[0]
        cuda.memcpy_htod_async(d_E, tl_args.E)

        d_x_0 = module.get_global('d_x_0')[0]
        cuda.memcpy_htod_async(d_x_0, tl_args.x_0)
Developer: ucl-cssb, Project: py_stoch, Lines: 15, Source: cuGillespie.py


Example 10: set_async

    def set_async(self, ary, stream=None):
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if ary.strides != self.strides:
            from warnings import warn
            warn("Setting array from one with different strides/storage order. "
                    "This will cease to work in 2013.x.",
                    stacklevel=2)

        assert self.flags.forc

        if not ary.flags.forc:
            raise RuntimeError("cannot asynchronously set from "
                    "non-contiguous array")

        if self.size:
            drv.memcpy_htod_async(self.gpudata, ary, stream)
Developer: hannes-brt, Project: pycuda, Lines: 17, Source: gpuarray.py


Example 11: test_register_host_memory

    def test_register_host_memory(self):
        if drv.get_version() < (4,):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20,), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Developer: chunggi, Project: pycuda, Lines: 17, Source: test_driver.py


Example 12: run

            def run(self, scomp, scopy):
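                # op, m, v, fn, event, grid and block are captured from the
                # enclosing scope in the original packing.py.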
                # If we are unpacking then copy the host buffer to the GPU
                if op == 'unpack':
                    cuda.memcpy_htod_async(m.data, m.hdata, scopy)
                    event.record(scopy)
                    scomp.wait_for_event(event)

                # Call the CUDA kernel (pack or unpack)
                fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                                       v.mapping, v.strides, m,
                                       v.mapping.leaddim, v.strides.leaddim,
                                       m.leaddim)

                # If we have been packing then copy the GPU buffer to the host
                if op == 'pack':
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
Developer: bartwozniak, Project: PyFR, Lines: 18, Source: packing.py


Example 13: set

    def set(self, ary):
        """
        copy host array to device.
        Arguments:
            ary: host array, needs to be contiguous
        Returns:
            self
        """
        stream = self.backend.stream
        assert ary.size == self.size
        assert self.is_contiguous, "Array in set() must be contiguous"
        if ary.dtype is not self.dtype:
            ary = ary.astype(self.dtype)
        assert ary.strides == tuple(self.dtype.itemsize*s for s in self.strides)

        drv.memcpy_htod_async(self.gpudata, ary, stream)

        return self
Developer: davidoj, Project: nervanagpu, Lines: 18, Source: nervanagpu.py


Example 14: _interp

    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
Developer: stevenrobertson, Project: cuburn, Lines: 19, Source: render.py


Example 15: test_streamed_kernel

    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
Developer: minrk, Project: PyCUDA, Lines: 41, Source: test_driver.py


Example 16: scatter

    def scatter(self, hbuf, dbuf):
        '''
        scatters the array data in hbuf to the mgpu tensor
        assumes that dbuf is a M x N and hbuf is M x (Nxk) where k is the
        number of replicas
        also assumes that dtype of hbuf and dbuf are the same
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()
Developer: neuroidss, Project: neon, Lines: 21, Source: mgpu.py


Example 17: asy_cpy

def asy_cpy(a, a_gpu, auto_init_context=True):
    """Data transfer from host to device.
    
    Asynchronous will be enabled when auto_init_context is True, otherwise
    use normal transfer.
    """

    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)

        # Test correctness
        # ctx.synchronize()
        # b= numpy.zeros_like(a, a.dtype)
        # drv.memcpy_dtoh(b, a_gpu)
        # print numpy.allclose(a, b)
        return strm
    else:
        drv.memcpy_htod(a_gpu, a)
Developer: budiaji, Project: anuga-cuda, Lines: 21, Source: utility.py


Example 18: _copy

    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation source
        buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(gnm, self.fb.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm["palette"]])
        ptimes, pvals = zip(*sorted(palsrc.items()))
        palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
        palettes[:] = pvals
        palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[: len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times, self.stream_a)
Developer: vincentmele, Project: cuburn, Lines: 21, Source: render.py


Example 19: run

 def run(self, queue):
     cuda.memcpy_htod_async(mv.data, mv.hdata,
                            queue.cuda_stream_comp)
Developer: pv101, Project: PyFR, Lines: 3, Source: packing.py


Example 20: async_copy

 def async_copy(self, dest, src, stream=None):
     drv.memcpy_htod_async(dest.gpudata, src, stream)
Developer: YouVentures, Project: neon, Lines: 2, Source: gpu.py



Note: The pycuda.driver.memcpy_htod_async examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code, and do not republish this article without permission.

