
Python driver.memcpy_dtoh Function Code Examples


This article collects typical usage examples of the pycuda.driver.memcpy_dtoh function in Python. If you are wondering how to call memcpy_dtoh, what its arguments look like in practice, or simply want to see real code that uses it, the curated examples below should help.



Twenty code examples of the memcpy_dtoh function are shown below, sorted by popularity by default.
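Before the collected examples, here is a minimal sketch of the usual memcpy_dtoh round trip. It is only an illustration (it assumes a CUDA-capable GPU and a working pycuda installation; the array name and size are made up for the example):

import numpy as np
import pycuda.autoinit          # creates a context on the default device
import pycuda.driver as cuda

a = np.arange(16, dtype=np.float32)

# allocate device memory and copy host -> device
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# ... a kernel launch would normally modify a_gpu here ...

# memcpy_dtoh copies device -> host into a preallocated array
# whose size and dtype must match the device buffer
result = np.empty_like(a)
cuda.memcpy_dtoh(result, a_gpu)
print(result)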

Example 1: calcV1complex

    def calcV1complex(self, stim, speed):
        """Compute V1 complex cell responses of a frame."""

        # allocate stim on device
        self._loadInput(stim)

        # convolve the stimulus with separate V1 filters
        self._calcV1linear()

        # rectify linear response to get V1 simple cell firing rate
        self._calcV1rect()

        # spatial pooling to get V1 complex
        self._calcV1blur()

        # divisive normalization
        self._calcV1normalize()

        # steer filters in specified directions
        self._calcV1direction(speed)

        # get data from device
        res = np.zeros(self.nrX*self.nrY*self.nrDirs).astype(np.float32)
        cuda.memcpy_dtoh(res, self.d_respV1c)

        return res
Developer: UCI-CARL, Project: MotionEnergy, Lines: 26, Source: motionenergy.py


Example 2: scenario_inplace_padded_C2R

def scenario_inplace_padded_C2R(batch,tic,toc):

  n = array([2*BENG_CHANNELS_],int32)
  inembed = array([16*(BENG_CHANNELS//16+1)],int32)
  onembed = array([2*inembed[0]],int32)
  plan = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1, inembed[0],
  	                                       onembed.ctypes.data, 1, onembed[0],
  					       cufft.CUFFT_C2R, batch)

  data_shape = (batch,inembed[0])
  cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
  cpu_data = cpu_data.astype(complex64)
  gpu_data  = cuda.mem_alloc(8*batch*inembed[0])		# complex64
  cuda.memcpy_htod(gpu_data,cpu_data)

  tic.record()
  cufft.cufftExecC2R(plan,int(gpu_data),int(gpu_data))
  toc.record()
  toc.synchronize()

  cpu_result = np.empty(batch*onembed[0],dtype=np.float32)
  cuda.memcpy_dtoh(cpu_result,gpu_data)
  cpu_result = cpu_result.reshape((batch,onembed[0]))[:,:2*BENG_CHANNELS_]/(2*BENG_CHANNELS_)
  result = irfft(cpu_data[:,:BENG_CHANNELS],axis=-1)
  print('Batched in-place scenario')
  print('test passed:', np.allclose(cpu_result, result))
  print('GPU time:', tic.time_till(toc), ' ms = ', tic.time_till(toc)/(batch*0.5*13.128e-3), ' x real (both SB)')
Developer: sma-wideband, Project: sdbe, Lines: 27, Source: fft_test.py


Example 3: runTest

    def runTest(self):
        nx, ny, nz, str_f, pt0, pt1, is_array = self.args
        slice_xyz = common.slices_two_points(pt0, pt1)

        # generate random source
        if is_array:
            shape = common.shape_two_points(pt0, pt1)
            value = np.random.rand(*shape).astype(np.float32)
        else:
            value = np.random.ranf()

        # instance
        fields = Fields(0, nx, ny, nz, '', 'single')

        tfunc = lambda tstep: np.sin(0.03*tstep)
        incident = IncidentDirect(fields, str_f, pt0, pt1, tfunc, value) 

        # host allocations
        eh = np.zeros(fields.ns_pitch, dtype=fields.dtype)

        # verify
        eh[slice_xyz] = fields.dtype(value) * fields.dtype(tfunc(1))
        fields.update_e()
        fields.update_h()

        copy_eh_buf = fields.get_buf(str_f)
        copy_eh = np.zeros_like(eh)
        cuda.memcpy_dtoh(copy_eh, copy_eh_buf)

        original = eh[slice_xyz]
        copy = copy_eh[slice_xyz]
        norm = np.linalg.norm(original - copy)
        self.assertEqual(norm, 0, '%s, %g' % (self.args, norm))

        fields.context_pop()
Developer: wbkifun, Project: fdtd_accelerate, Lines: 35, Source: test_incident_direct.py


Example 4: calculate

    def calculate (self, data, f_high, f_bins):
        
        import pycuda.driver as driver
        import pycuda.compiler as compiler
        import pycuda.autoinit
        
        log = logging.getLogger("astroplpython.function.signal")   
        log.debug("CULSP.calculate() called")
        
        log.debug("Orig Data:"+str(data)) 
        
        log.debug(" TODO: Calculate blocksize")

        log.debug("set up GPU, allocate memory for working")
        a_gpu = driver.mem_alloc(data.size * data.dtype.itemsize)
        
        log.debug("push data into GPU memory")
        driver.memcpy_htod(a_gpu, data)
        
        log.debug("compile and run the culsp_kernel on data in the GPU")
        culsp_func = compiler.SourceModule(self._kernelStr).get_function("culsp_kernel") 
        culsp_func (a_gpu, block=(4,4,1))

        log.debug("pull data from GPU back into main memory")
        result = np.empty_like(data)
        driver.memcpy_dtoh(result, a_gpu)
        
        log.debug("return result") 
        return result
Developer: brianthomas, Project: astroplpython, Lines: 29, Source: LSPeriodogram.py


Example 5: poisson_parallel

def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    # create Cheetah template and fill in variables for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # allocate memory on the GPU
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source, d_destination, d_buffer = cu.mem_alloc(source_im.nbytes), cu.mem_alloc(dest_im.nbytes), cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # call the CUDA Poisson-blending kernel n times
    for i in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # retrieve the final output image and return it
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
Developer: JMTing, Project: cs205, Lines: 27, Source: parallel_poisson.py


Example 6: diffuse_pycuda

def diffuse_pycuda(u):
    
    nx,ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5/(nx-1))
    dy = np.float32(3.5/(ny-1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time/dt))
#     print nt
    
    u[0,:]=200
    u[:,0]=200  
    
    u = u.astype(np.float32)
    
    u_prev = u.copy()    
    
    u_d = cuda.mem_alloc(u.size*u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size*u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    gridSize = (int(np.ceil(nx/BLOCKSIZE)),int(np.ceil(nx/BLOCKSIZE)),1)
    blockSize = (BLOCKSIZE,BLOCKSIZE,1)

    for t in range(nt+1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
    
    cuda.memcpy_dtoh(u, u_d)
    
    return u
Developer: htapia, Project: lania.pd, Lines: 34, Source: diffuse.py


Example 7: fromGPU

 def fromGPU(self, shared_mem, buff_dtype=np.float32 ):
     
     buff = np.frombuffer(shared_mem.get_obj(), dtype=buff_dtype)
     buff = buff[:self.buffer_nnets*self.buffer_nsamples]
     buff = buff.reshape( (self.buffer_nnets, self.buffer_nsamples) )
     cuda.memcpy_dtoh(buff, self.gpu_data)
     return buff
Developer: JohnCEarls, Project: GPUDirac, Lines: 7, Source: data.py


Example 8: test_constant_memory

    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all()
Developer: davidweichiang, Project: pycuda, Lines: 28, Source: test_driver.py


Example 9: calc_bandwidth_d2h

	def calc_bandwidth_d2h( s ):
		t1 = datetime.now()
		cuda.memcpy_dtoh( s.a, s.dev_a )
		dt = datetime.now() - t1
		dt_float = dt.seconds + dt.microseconds*1e-6

		return s.nbytes/dt_float/gbytes
Developer: wbkifun, Project: fdtd_accelerate, Lines: 7, Source: 150-gpus-mpi-range-h5-seperate.py


Example 10: fromSourceFile

def fromSourceFile():
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

    #random data
    np.random.seed(1)
    a = np.random.randn(4,4)
    a = a.astype(np.float32)

    #read code and get function
    mod = SourceModule(open('simple.cu').read())
    func = mod.get_function("doublify")

    #allocate memory on the GPU
    a_gpu = cuda.mem_alloc(a.nbytes)

    #transfer to the GPU memory
    cuda.memcpy_htod(a_gpu, a)

    #execute
    func(a_gpu, block=(4,4,1))

    #collect results
    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)

    print(a_doubled)
    print(a_doubled / (a*2))
Developer: eddienko, Project: EuclidVisibleInstrument, Lines: 30, Source: cudaTests.py


Example 11: test_prepared_invocation

    def test_prepared_invocation(self):
        a = np.random.randn(4,4).astype(np.float32)
        a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize)

        drv.memcpy_htod(a_gpu, a)

        mod = SourceModule("""
            __global__ void doublify(float *a)
            {
              int idx = threadIdx.x + threadIdx.y*blockDim.x;
              a[idx] *= 2;
            }
            """)

        func = mod.get_function("doublify")
        func.prepare("P")
        func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20)
        a_doubled = np.empty_like(a)
        drv.memcpy_dtoh(a_doubled, a_gpu)
        print (a)
        print (a_doubled)
        assert la.norm(a_doubled-2*a) == 0

        # now with offsets
        func.prepare("P")
        a_quadrupled = np.empty_like(a)
        func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize)
        drv.memcpy_dtoh(a_quadrupled, a_gpu)
        assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0
Developer: davidweichiang, Project: pycuda, Lines: 29, Source: test_driver.py


Example 12: loop

def loop(iterations):
    ts = 0
    while(ts<iterations):
        ' To avoid overwrites a temporary copy is made of F '
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)
        
        ' Propagate '
        prop(F_gpu, T_gpu, 
             block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))
        
        ' Calculate density and get bounceback from obstacle nodes '
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))
        
        ' Calculate equilibrium '
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu, 
           U_C4_gpu, U_C6_gpu, U_C8_gpu, block=(blockDimX,blockDimY,1), 
           grid=(gridDimX,gridDimY))
        
        ' Transfer bounceback to obstacle nodes '
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu,
                   block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY))
                              
        ' Copy F to host for copy to T in beginning of loop '
        cuda.memcpy_dtoh(F, F_gpu)
        
        ts += 1
Developer: hohiroki, Project: Lattice-Boltzmann, Lines: 28, Source: lbm2dcu.py


Example 13: cuda_crossOver

def cuda_crossOver(sola, solb):
    """ """
    
    sol_len = len(sola);
    
    a_gpu = cuda.mem_alloc(sola.nbytes);
    b_gpu = cuda.mem_alloc(solb.nbytes);
    
    cuda.memcpy_htod(a_gpu, sola);
    cuda.memcpy_htod(b_gpu, solb);
    
    func = mod.get_function("crossOver");
    func(a_gpu,b_gpu, block=(sol_len,1,1));
    
    a_new = numpy.empty_like(sola);
    b_new = numpy.empty_like(solb);
    
    cuda.memcpy_dtoh(a_new, a_gpu);
    cuda.memcpy_dtoh(b_new, b_gpu);
    
    if debug:
        print("a:", sola)
        print("b:", solb)
        print("new a:", a_new)
        print("new b:", b_new)
        
    return a_new,b_new;
Developer: adamuas, Project: coevondm, Lines: 27, Source: cudaInterface.py


Example 14: _debug_print

    def _debug_print(self):
        cuda_driver.memcpy_dtoh(self.f, self.df1)

        np.set_printoptions(3, 10000, linewidth=200, suppress=True)

        print('#' * 80)
        print(self.f)
Developer: jkotur, Project: particles, Lines: 7, Source: lbm.py


Example 15: convolution_cuda

def convolution_cuda(sourceImage,  filterx,  filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32',  'source image must be float32'
    (imageHeight,  imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) ,  'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    BYTES_PER_WORD = 4  # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(d_Kernel_rows,  filterx) # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns,  filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    convolutionRowGPU(intermediateImage_gpu,  sourceImage_gpu,  DATA_W,  DATA_H,  grid=[int(e) for e in blockGridRows],  block=[int(e) for e in threadBlockRows])    
    convolutionColumnGPU(destImage_gpu,  intermediateImage_gpu,  DATA_W,  DATA_H,  numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),  numpy.int32(DATA_W * threadBlockColumns[1]),  grid=[int(e) for e in blockGridColumns],  block=[int(e) for e in threadBlockColumns])

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage,  destImage_gpu)
    return destImage
Developer: eddienko, Project: EuclidVisibleInstrument, Lines: 34, Source: Convolution.py


Example 16: interior_buffer

def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    # create Cheetah template and fill in variables for the mask kernel
    mask_template = Template(mask_source)
    mask_template.BLOCK_DIM_X = b_size[0]
    mask_template.BLOCK_DIM_Y = b_size[1]
    mask_template.WIDTH = dest_im.shape[1]
    mask_template.HEIGHT = dest_im.shape[0]
    mask_template.RGB = RGB
    mask_template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    mask_kernel = cuda_compile(mask_template, "mask_kernel")

    # allocate memory on the GPU
    d_source = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(d_source, source_im)

    # run the kernel on the GPU to filter out interior points of the mask
    mask_kernel(d_source, block=b_size, grid=g_size)

    # retrieve the interior-point buffer from the GPU
    inner_buffer = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(inner_buffer, d_source)

    # return the interior buffer
    return inner_buffer
Developer: JMTing, Project: cs205, Lines: 26, Source: parallel_poisson.py


Example 17: calc_blob_blob_forces_pycuda

def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
   
  # Determine number of threads and blocks for the GPU
  number_of_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Get parameters from arguments
  L = kwargs.get('periodic_length')
  eps = kwargs.get('repulsion_strength')
  b = kwargs.get('debye_length')
  blob_radius = kwargs.get('blob_radius')

  # Reshape arrays
  x = np.reshape(r_vectors, number_of_blobs * 3)
  f = np.empty_like(x)
        
  # Allocate GPU memory
  x_gpu = cuda.mem_alloc(x.nbytes)
  f_gpu = cuda.mem_alloc(f.nbytes)
    
  # Copy data to the GPU (host to device)
  cuda.memcpy_htod(x_gpu, x)
    
  # Get blob-blob force function
  force = mod.get_function("calc_blob_blob_force")

  # Compute mobility force product
  force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius), np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs, block=(threads_per_block, 1, 1), grid=(num_blocks, 1)) 
   
  # Copy data from GPU to CPU (device to host)
  cuda.memcpy_dtoh(f, f_gpu)

  return np.reshape(f, (number_of_blobs, 3))
Developer: stochasticHydroTools, Project: RigidMultiblobsWall, Lines: 33, Source: forces_pycuda_user_defined.py


Example 18: test_compare_order

def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print('')
    print('f_1d_C\n\n', f_1d)
    print('f_2d_C\n', f_2d_C)
    print('f_2d_F\n', f_2d_F)

    print('')
    print('after cuda')
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_C\n', ret_f_1d)

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_F\n', ret_f_1d)
Developer: wbkifun, Project: my_stuff, Lines: 31, Source: compare_order_C_F.py


Example 19: calc_psd

    def calc_psd(self,bitloads,xtalk):
        #Number of expected permutations
        Ncombinations=self.K
        
        #Check if this is getting hairy and assign grid/block dimensions
        (warpcount,warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations)

        #How many individual lk's
        memdim=blockCount*threadCount

        threadshare_grid=(blockCount,1)
        threadshare_block=(threadCount,1,1)
        
        #Memory (We get away with the NCombinations because calpsd checks against it)
        d_a=cuda.mem_alloc(np.zeros((Ncombinations*self.N*self.N)).astype(self.type).nbytes)
        d_p=cuda.mem_alloc(np.zeros((Ncombinations*self.N)).astype(self.type).nbytes)
        d_bitload=cuda.mem_alloc(np.zeros((self.K*self.N)).astype(np.int32).nbytes)
        d_XTG=cuda.mem_alloc(np.zeros((self.K*self.N*self.N)).astype(self.type).nbytes)
        h_p=np.zeros((self.K,self.N)).astype(self.type)
        cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG,xtalk.astype(self.type))
        #Go solve
        #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){

        self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p,d_p)
        d_a.free()
        d_bitload.free()
        d_XTG.free()
        d_p.free()
        return h_p.astype(np.float64)
Developer: andrewbolster, Project: multiuserDSM, Lines: 32, Source: gpu.py


Example 20: test_pycuda

 def test_pycuda(self):
     """
     Test pycuda installation with small example.
     :return:
     :rtype:
     """
     try:
         import pycuda.driver as cuda
         import pycuda.autoinit
         from pycuda.compiler import SourceModule
         import numpy as np
         a = np.random.randn(4, 4)
         print(a)
         a= a.astype(np.float32)
         a_gpu = cuda.mem_alloc(a.nbytes)
         cuda.memcpy_htod(a_gpu, a)
         mod = SourceModule(
             """
             __global__ void doublify(float *a)
             {
             int idx = threadIdx.x + threadIdx.y*4;
             a[idx] *= 2;
             }
             """
         )
         func = mod.get_function("doublify")
         func(a_gpu, block=(4,4,1))
         a_doubled = np.empty_like(a)
         cuda.memcpy_dtoh(a_doubled, a_gpu)
         #print(a_doubled)
         #print(a)
     except Exception:
         self.fail('Still not working')
Developer: SpikingNeurons, Project: WriterIdentification, Lines: 33, Source: unit_tests.py



Note: The pycuda.driver.memcpy_dtoh examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, and redistribution or use should follow each project's license. Do not republish without permission.

