• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Python driver.to_device函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Python 中 pycuda.driver.to_device 函数的典型用法代码示例。如果您正苦于以下问题:Python to_device 函数的具体用法?Python to_device 怎么用?Python to_device 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了 to_device 函数的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。

示例1: test_compare_order

def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
开发者ID:wbkifun,项目名称:my_stuff,代码行数:31,代码来源:compare_order_C_F.py


示例2: multiply_csr

def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Multiply a matrix by a vector on the GPU using the CSR CUDA kernel.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiply. It is converted with
        ``mf.convert_to_scipy_csr`` before use.
    vector : numpy array
        Vector to multiply. Its length must equal the number of columns
        of the matrix.
    block_size : int (recommended 128 or 256)
        Size of the CUDA block.
    repeat : int > 0
        Number of times the kernel is launched. It has no effect on the
        result; it only sets the length of the returned list of
        execution times.

    Returns
    =======
    Tuple ``(result, time_list)``: the float32 result vector and one
    ``start.time_till(end)`` reading per repetition.

    Raises
    ======
    ArithmeticError
        If ``len(vector)`` differs from ``matrix.shape[1]``.
    '''
    if len(vector) != matrix.shape[1]:
        # The two literals are concatenated, so the first one needs the
        # trailing space to keep "the number" as two words.
        raise ArithmeticError('Length of the vector is not equal to the '
                              'number of columns of the matrix.')
    matrix = mf.convert_to_scipy_csr(matrix)
    data = numpy.array(matrix.data, dtype=numpy.float32)
    indices = numpy.array(matrix.indices, dtype=numpy.int32)
    indptr = numpy.array(matrix.indptr, dtype=numpy.int32)
    data = cuda.to_device(data)
    indices = cuda.to_device(indices)
    indptr = cuda.to_device(indptr)
    num_rows = matrix.shape[0]
    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    # One thread per row, rounded up to whole blocks.
    grid_size = int(numpy.ceil((num_rows+0.0)/block_size))
    block = (block_size, 1, 1)
    grid = (grid_size, 1)
    g_vector = cuda.to_device(vector)
    num_rows = numpy.int32(num_rows)

    # The kernel reads the vector through a texture reference bound to
    # the uploaded copy.
    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]

    # NOTE(review): `start` and `end` are not defined in this function;
    # presumably module-level timing events - confirm.
    for _ in range(repeat):
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))
    return (result, time_list)
开发者ID:fivitti,项目名称:SMDV,代码行数:60,代码来源:matrixmultiplication.py


示例3: allocation

 def allocation(self):
     """Run the parent allocation, then create the device-side buffers.

     ``ul_gpu``, ``ul_prev_gpu``, ``ul_tmp_gpu`` and ``kl_gpu`` are each
     set from their own ``cuda.to_device(self.ul)`` call; ``el_sum_gpu``
     is set from an uploaded ``np.zeros(self.ne)``.
     """
     super(DGModalGpu, self).allocation()
     for attr in ('ul_gpu', 'ul_prev_gpu', 'ul_tmp_gpu', 'kl_gpu'):
         setattr(self, attr, cuda.to_device(self.ul))
     self.el_sum_gpu = cuda.to_device(np.zeros(self.ne))
开发者ID:wbkifun,项目名称:my_research,代码行数:7,代码来源:dg_modal_gpu.py


示例4: test_simple_kernel_2

    def test_simple_kernel_2(self):
        """Check element-wise multiply with pre-uploaded inputs.

        The first launch passes both device allocations unchanged. The
        second passes the first input's address advanced by one element,
        so 399 threads compute ``a[1:] * b[:-1]``.
        """
        module = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x;
          dest[i] = a[i] * b[i];
        }
        """)
        kernel = module.get_function("multiply_them")

        count = 400
        lhs = np.random.randn(count).astype(np.float32)
        rhs = np.random.randn(count).astype(np.float32)
        lhs_dev = drv.to_device(lhs)
        rhs_dev = drv.to_device(rhs)

        # Plain launch over all elements.
        product = np.zeros_like(lhs)
        kernel(drv.Out(product), lhs_dev, rhs_dev, block=(count, 1, 1))
        assert la.norm(product - lhs*rhs) == 0

        drv.Context.synchronize()

        # Same kernel, but the first input pointer is shifted by one item.
        shifted = np.zeros_like(lhs)
        lhs_from_second = np.intp(lhs_dev) + lhs.itemsize
        kernel(drv.Out(shifted), lhs_from_second, rhs_dev,
               block=(count - 1, 1, 1))

        assert la.norm(shifted[:-1] - lhs[1:]*rhs[:-1]) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:30,代码来源:test_driver.py


示例5: train_gpu

    def train_gpu(self, num_iter, model_file_path):
        """Launch the 'train_sg' kernel num_iter times and dump the result.

        When ``self.batch == 0`` the method first uploads ``self.syn1`` and
        builds one ``WordRep`` per dictionary entry inside a single device
        allocation of ``len(self.dictionary) * WordRep.memsize`` bytes.

        Each iteration draws one random word id per sentence, uploads the
        ids and launches the kernel with one block per sentence and
        ``self.size`` threads per block. Afterwards the sentence
        representations are copied back into ``self.sent_reps`` and passed
        to ``pickle_dump`` together with ``model_file_path``.

        Parameters
        ----------
        num_iter : int
            Number of kernel launches.
        model_file_path
            Forwarded unchanged to ``pickle_dump``.
        """
        if self.batch == 0:
            # Prepare to send the numpy array to gpu
            self.syn1_gpu = cuda.to_device(self.syn1)
            # Create word idx and related data-structure.
            self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
            word_rep_ptr = int(self.base_word_rep)
            self.word_reps = {}
            for w_idx, word in sorted(self.dictionary.items()):
                # Maps the stored code values v to 1 - 2*v.
                word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
                word_point = self.words_rep[word][1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
                word_rep_ptr += WordRep.memsize
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print("GPU transfers done.")

        self.sent_reps_gpu = cuda.to_device(self.sent_reps)
        # Prepare sentences for GPU transfer: keep only in-dictionary words.
        idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                         for sentence in self.sentences]

        # Prepare the kernel function
        kernel = self.kernel_str.get_function("train_sg")
        words = np.empty(self.num_sents, dtype=np.int32)
        for _ in range(num_iter):
            # Sample words for each sentence and transfer to GPU
            for s_idx in range(self.num_sents):
                words[s_idx] = random.choice(idx_sentences[s_idx])
            words_gpu = cuda.to_device(words)
            kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
                   block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
        self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
开发者ID:ustbliubo2014,项目名称:DeepLearn,代码行数:35,代码来源:paragraph_vector.py


示例6: get_phir_gpu

def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """Launch the 'get_phir' kernel and return its two host-side results.

    Every entry of XK and XV is expanded into ``K = par_reac.K`` weighted
    entries (weights from ``getWeights(K)``), reordered by
    ``surface.sortSource``, cast to ``par_reac.REAL`` and uploaded. The
    kernel is launched with ``par_reac.BSZ`` threads per block and enough
    blocks to cover ``len(field.xq)`` threads.

    Parameters
    ----------
    XK, XV : sequences of equal length N
        Indexed per element; scaled by weight, ``surface.Area`` and, for
        XK, the three ``surface.normal`` components.
    surface, field, par_reac
        Project objects providing the host arrays and device handles that
        are read below.
    kernel
        Compiled module exposing ``get_function("get_phir")``.

    Returns
    -------
    (phir_cpu, AI_int)
        ``phir_cpu`` is the REAL array of length ``len(field.xq)`` copied
        back from the device; ``AI_int`` is the sum of the int32 counter
        array the kernel filled.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    K = par_reac.K

    # Setup vectors: K weighted copies per element.
    w = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        # Floor division keeps the element index an integer under both
        # Python 2 and Python 3 (plain `/` yields a float on Python 3).
        elem = i // K
        node = i % K
        weighted_K = XK[elem]*w[node]*surface.Area[elem]
        X_V[i] = XV[elem]*w[node]*surface.Area[elem]
        X_Kx[i] = weighted_K*surface.normal[elem, 0]
        X_Ky[i] = weighted_K*surface.normal[elem, 1]
        X_Kz[i] = weighted_K*surface.normal[elem, 2]
        X_Kc[i] = XK[elem]
        X_Vc[i] = XV[elem]

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Round the block count up so all Nq threads are covered.
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
开发者ID:cdcooper84,项目名称:pygbe,代码行数:59,代码来源:projection.py


示例7: __init__

 def __init__(self, code, point, struct_ptr):
     """Upload ``code`` and ``point`` and fill the record at ``struct_ptr``.

     Three values are written to device memory starting at
     ``int(struct_ptr)``: ``code.size`` as int32 at offset 0, the device
     address of the uploaded ``code`` at offset 8, and the device address
     of the uploaded ``point`` one ``np.intp`` further on. The host
     shapes and dtypes of both arrays are kept on the instance.
     """
     self.code = cuda.to_device(code)
     self.point = cuda.to_device(point)

     self.code_shape = code.shape
     self.code_dtype = code.dtype
     self.point_shape = point.shape
     self.point_dtype = point.dtype

     base = int(struct_ptr)
     pointer_bytes = np.intp(0).nbytes
     record = (
         (base, np.int32(code.size)),
         (base + 8, np.intp(int(self.code))),
         (base + 8 + pointer_bytes, np.intp(int(self.point))),
     )
     for address, value in record:
         cuda.memcpy_htod(address, value)
开发者ID:Huskyeder,项目名称:ParagraphVec,代码行数:8,代码来源:paragraph_vector.py


示例8: sync_to_device

 def sync_to_device(self):
     """Upload the object list and return a device pointer pair.

     ``self.object_list`` is packed with each item's ``as_array()`` and
     uploaded, ``self.object_count`` is uploaded as a one-element int32
     array, and finally a two-element ``np.intp`` array holding both
     device allocations is uploaded. That last allocation is stored in
     ``self.device_ptr`` and returned.
     """
     packed = [item.as_array() for item in self.object_list]
     self.object_array = np.array(packed)
     self.d_object_array = cuda.to_device(self.object_array)

     count = np.array([self.object_count], dtype=np.int32)
     self.d_object_count = cuda.to_device(count)

     pointer_pair = np.array(
         [self.d_object_array, self.d_object_count], dtype=np.intp)
     self.device_ptr = cuda.to_device(pointer_pair)
     return self.device_ptr
开发者ID:cfobel,项目名称:pycuda_helpers,代码行数:10,代码来源:struct_container.py


示例9: nlargest

  def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    The search runs in passes of at most self.maxHeapFloats outputs each;
    every pass calls the prepared nlargest kernel, which updates the
    device-side thresholds and threshold counts.

    @param n: number of outputs which should be above the threshold
    @type n: int

    @return: float32 array with one threshold per individual (length
             self.popSize), copied back from the device; also stored in
             self.thresholdsMat
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    # Split the request into chunks of at most maxHeapFloats.
    passSizes = []
    while n > 0:
      nextSize = min(self.maxHeapFloats, n)
      passSizes.append(nextSize)
      n -= nextSize

    log.debug("pass sizes: %r", passSizes)

    # Thresholds start at +inf, counts at zero.
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
      log.debug("begin pass size %d", passSize)
      self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                        self.outputs,
                                        self.trainSet.size,
                                        self.popSize,
                                        passSize,
                                        self.thresholds,
                                        self.thresholdCounts)

      driver.Context.synchronize()

      # Only pay for the device-to-host copies when they will be logged.
      if log.isEnabledFor(logging.DEBUG):
        thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
        log.debug("thresholds: %s", str(thresholdsMat))

        thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
        log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
开发者ID:cpatulea,项目名称:evolution,代码行数:53,代码来源:ann.py


示例10: P2PKt_gpu

def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu, 
            surf, LorY, w, param, timing, kernel):
    """
    Launch the "P2PKt" kernel and return the three device result handles.

    The function has three timed phases, each bracketed by tic/toc events:
    uploading m and mKtc (added to timing.time_trans), the kernel launch
    (added to timing.time_P2P), and reading back the int32 counter array
    (added to timing.time_trans; the counters, reordered by
    surfTar.unsort and summed, are added to timing.AI_int).

    Ktx_gpu, Kty_gpu and Ktz_gpu are passed to the kernel and returned
    unchanged as Python objects. The parameter `w` is not referenced in
    this function.
    """

    # Pick the event class used for timing.
    # NOTE(review): the non-GPU `Event` is defined elsewhere - confirm it
    # offers record/synchronize/time_till like cuda.Event.
    if param.GPU==1:
        tic = cuda.Event() 
        toc = cuda.Event() 
    else:
        tic = Event()
        toc = Event()

    # Phase 1: host -> device transfer of the two input vectors.
    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    # Elapsed time scaled by 1e-3 (ms -> s if these are pycuda events).
    timing.time_trans += tic.time_till(toc)*1e-3


    # Phase 2: kernel launch.
    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    # Zero-initialised int32 counters, one per entry up to param.Nround.
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays


    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu, 
                surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev, 
                surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, 
                surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), 
                numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    # Phase 3: device -> host transfer of the counters.
    tic.record()
    # This zero array is overwritten by the from_device call on the next line.
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:52,代码来源:FMMutils.py


示例11: K

 def K(self, Q, P, angles, quadratures):
     """Evaluate the K kernel on the GPU and return the scaled result.

     ``cos(angles)`` and ``sin(angles)`` are copied as float32 into the
     module globals "cos_phi" and "sin_phi". ``self.K_gpu`` then runs on
     a ``(Q.shape[0], quadratures.size // 1024)`` grid with 1024 threads
     per block, writing into an intermediate device buffer, which
     ``self.reduction_gpu`` reduces into one float32 per row of ``Q``.
     The returned array is that result divided by ``self.L``.
     """
     threads = 1024
     float_bytes = 4

     trig_tables = (("cos_phi", cos(angles)), ("sin_phi", sin(angles)))
     for symbol, values in trig_tables:
         target = self.mod_K.get_global(symbol)[0]
         drv.memcpy_htod(target, values.astype(scipy.float32))

     rows = Q.shape[0]
     chunks = int(floor(quadratures.size / float(threads)))
     result = scipy.empty((rows,), dtype=scipy.float32)
     partial = drv.mem_alloc(float_bytes*chunks*rows)
     q_dev = drv.to_device(Q)
     p_dev = drv.to_device(P)

     self.K_gpu(drv.In(quadratures), q_dev, p_dev, partial,
                block=(1, threads, 1), grid=(rows, chunks),
                shared=threads*float_bytes)
     self.reduction_gpu(partial, drv.Out(result),
                        block=(1, chunks, 1), grid=(rows, 1),
                        shared=chunks*float_bytes)
     return result/self.L
开发者ID:martina88esposito,项目名称:tomohowk,代码行数:13,代码来源:tomography_cuda.py


示例12: __init__

    def __init__(self):
        """Create the stream, the page-locked pool and the shared buffers.

        After ``self._clear()`` three device resources are set up: a
        two-element u32 array (``d_rb``), the seeds returned by
        ``mwc.make_seeds`` for ``util.DEFAULT_RB_SIZE * 256`` entries
        (``d_seeds``), and an uninitialised allocation of 16 bytes per
        entry (``d_points``).
        """
        self.stream = cuda.Stream()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self._clear()

        # Per the original author: these resources are shared through the
        # slots/ringbuffer mechanism across launches, genomes and render
        # kernels, and the seeds are self-synchronizing, so none of them
        # is tied to a particular stream object.
        entries = util.DEFAULT_RB_SIZE * 256
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        self.d_seeds = cuda.to_device(mwc.make_seeds(entries))
        self._len_d_points = entries * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)
开发者ID:stevenrobertson,项目名称:cuburn,代码行数:14,代码来源:render.py


示例13: M2P_gpu

def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY, timing, kernel):
    """Gather multipole data, upload it and launch the "M2P" kernel.

    For every cell listed in ``surfTar.M2P_list[surf, :M2P_size]`` the
    ``M`` and ``Md`` arrays of ``surfSrc.tree[cell]`` are copied into
    consecutive ``param.Nm``-long slots of two flat host arrays, which
    are cast to ``param.REAL`` and uploaded. The whole sequence (gather,
    upload, launch) is timed with one tic/toc pair and added to
    ``timing.time_M2P``. ``K_gpu`` and ``V_gpu`` are returned as passed.
    """
    use_cuda_events = param.GPU == 1
    tic = cuda.Event() if use_cuda_events else Event()
    toc = cuda.Event() if use_cuda_events else Event()

    REAL = param.REAL
    Nm = param.Nm

    tic.record()
    M2P_size = surfTar.offsetMlt[surf, len(surfTar.twig)]
    MSort = numpy.zeros(Nm*M2P_size)
    MdSort = numpy.zeros(Nm*M2P_size)

    for slot, cell in enumerate(surfTar.M2P_list[surf, 0:M2P_size]):
        first = slot*Nm
        last = first + Nm
        MSort[first:last] = surfSrc.tree[cell].M
        MdSort[first:last] = surfSrc.tree[cell].Md

    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))

    # The device arrays are flattened over surfaces; these are the offsets
    # of this surface's first entry in the offset and list arrays.
    ptr_offset = surf*len(surfTar.offsetTwigs[surf])
    ptr_list = surf*len(surfTar.P2P_list[surf])

    # CUDA grid size, rounded up.
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT))
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(
        K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,
        surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
        MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
        ind0.indexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list),
        REAL(param.kappa),
        numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT),
        numpy.int32(LorY),
        block=(param.BSZ, 1, 1), grid=(GSZ, 1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:48,代码来源:FMMutils.py


示例14: batch_indexing

    def batch_indexing(self, planes, data_points):
        """Run the indexing kernel over data_points and return projections.

        Each entry of `planes` is uploaded separately; the resulting
        device addresses are packed into a uint64 array that is itself
        uploaded and passed to the kernel as its first argument.

        Parameters
        ----------
        planes : iterable of numpy arrays
            One array per plane, each uploaded with drv.to_device.
        data_points : numpy array
            Passed to the kernel via drv.In. Its first dimension divided
            by 128 (floor) is used as the number of data points.

        Returns
        -------
        numpy array of uint64 with one entry per data point, filled by
        the kernel via drv.Out.
        """
        # Floor division: a float here would break numpy.zeros on Python 3.
        data_size = data_points.shape[0] // 128

        self.benchmark_begin('preparing')

        # Holds the allocation objects for as long as this call runs;
        # presumably so the buffers are not released while only their
        # integer addresses are in use - confirm.
        gpu_alloc_objs = []

        planes_addresses = []
        for plane in planes:
            plane_addr = drv.to_device(plane)
            gpu_alloc_objs.append(plane_addr)
            planes_addresses.append(int(plane_addr))

        planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

        # 64 bit addressing space: each address costs 8 bytes.
        planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
        drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

        projections = numpy.zeros(data_size).astype(numpy.uint64)
        length = numpy.array([data_size]).astype(numpy.uint64)

        # Single-argument call form: same output on Python 2, valid on 3.
        print("total: " + str(data_size) + " data points to indexing.")

        self.benchmark_end('preparing')
        self.benchmark_begin('cudaing')

        self.indexing_kernel(
            planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length),
            block = self.block, grid = self.grid)

        self.benchmark_end('cudaing')

        return projections
开发者ID:viirya,项目名称:fastdict,代码行数:60,代码来源:cuda_indexing.py


示例15: index_list_backend

    def index_list_backend(self, ilists):
        """Flatten the index lists, pick the narrowest dtype and upload.

        All lists must have the same length, equal to
        ``self.plan.dofs_per_face``. The flattened indices are stored as
        uint8 when every value is below 256 and as uint16 otherwise; an
        assertion checks that the cast did not change any value.

        Returns a ``GPUIndexLists`` carrying the chosen type, a typedef
        named "index_list_entry_t", the device allocation and its size in
        bytes.
        """
        from pytools import flatten, single_valued
        from cgen import POD, Typedef

        common_length = single_valued(len(il) for il in ilists)
        assert common_length == self.plan.dofs_per_face

        raw_indices = numpy.array(list(flatten(ilists)))
        tp = numpy.uint8 if numpy.max(raw_indices) < 256 else numpy.uint16

        packed = numpy.asarray(raw_indices, dtype=tp)
        assert (packed == raw_indices).all()

        entry_typedef = Typedef(POD(tp, "index_list_entry_t"))
        return GPUIndexLists(
            type=tp,
            code=[entry_typedef],
            device_memory=cuda.to_device(packed),
            bytes=packed.size * packed.itemsize,
        )
开发者ID:gimac,项目名称:hedge,代码行数:26,代码来源:fluxgather.py


示例16: test_multichannel_linear_texture

    def test_multichannel_linear_texture(self):
        """Read a 4-channel float texture back through a copy kernel.

        A random (16, 16, 4) float32 array is uploaded, bound to the
        ``mtx_tex`` texture reference as 4-component FLOAT data, and
        copied channel by channel into ``dest`` by the kernel. The copy
        must match the source exactly.
        """
        module = SourceModule("""
        #define CHANNELS 4
        texture<float4, 1, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int i = threadIdx.x+blockDim.x*threadIdx.y;
          float4 texval = tex1Dfetch(mtx_tex, i);
          dest[i*CHANNELS + 0] = texval.x;
          dest[i*CHANNELS + 1] = texval.y;
          dest[i*CHANNELS + 2] = texval.z;
          dest[i*CHANNELS + 3] = texval.w;
        }
        """)

        kernel = module.get_function("copy_texture")
        texref = module.get_texref("mtx_tex")

        shape = (16, 16)
        channels = 4
        texel_shape = shape + (channels,)

        source = np.random.randn(*texel_shape).astype(np.float32)
        source_dev = drv.to_device(source)
        texref.set_address(source_dev, source.nbytes)
        texref.set_format(drv.array_format.FLOAT, 4)

        copied = np.zeros(texel_shape, dtype=np.float32)
        kernel(drv.Out(copied), block=shape + (1,), texrefs=[texref])

        assert la.norm(copied - source) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:34,代码来源:test_driver.py


示例17: cls_init

    def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n):
        """
        Fill the per-kernel slots used by the binary classifier cls1 vs cls2.

        Both class sizes are rounded up to a multiple of 32. The padded
        sum, the class sizes, the class numbers, the labels and a zeroed
        float32 output array of length ``2*y_cls.shape[0]`` are stored
        under index ``kernel_nr``, together with their device copies and
        the number of blocks per grid.

        Parameters
        ------------
        kernel_nr : int
            concurrent kernel number
        y_cls : array-like
            binary class labels (1,-1)
        cls1: int
            first class number
        cls2: int
            second class number
        cls1_n : int
            number of elements of class 1
        cls2_n : int
            number of elements of class 2
        """
        warp = 32

        def round_up_to_warp(count):
            return count + (warp - count % warp) % warp

        padded_cls1 = round_up_to_warp(cls1_n)
        padded_total = padded_cls1 + round_up_to_warp(cls2_n)

        self.cls1_N_aligned = padded_cls1
        self.sum_cls[kernel_nr] = padded_total

        # Host-side descriptors first, then their device copies.
        self.cls_count[kernel_nr] = np.array([cls1_n, cls2_n], dtype=np.int32)
        self.cls[kernel_nr] = np.array([cls1, cls2], dtype=np.int32)
        self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
        self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])

        # Blocks per grid: total threads divided by threads per block,
        # rounded up.
        total_threads = self.threadsPerRow*padded_total + 0.0
        self.bpg[kernel_nr] = int(np.ceil(total_threads/self.tpb))

        self.g_y[kernel_nr] = cuda.to_device(y_cls)

        self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0], dtype=np.float32)
        self.g_out[kernel_nr] = cuda.to_device(self.kernel_out[kernel_nr])
开发者ID:ksopyla,项目名称:pyKMLib,代码行数:47,代码来源:GPUKernels.py


示例18: go_sort_old

def go_sort_old(count, stream=None):
    """
    Benchmark driver: run the chain of sort kernels on `count` random bytes.

    The kernels are started through `launch` (defined elsewhere) in this
    order: 'prefix_scan_8_0_shmem_shortseg', 'better_split', 'prefix_sum',
    'sort_8', 'sort_8_a'. When `stream` is falsy the sorted output is
    copied back, compared with np.sort of the input, and run-length
    summaries of both are written to 'dev.txt' and 'cpu.txt'.

    NOTE(review): Python 2 code - `count / 8192` and `grids / 32` rely on
    integer division and the print statements are not valid Python 3.
    """
    # `count` random bytes viewed as uint8, uploaded as the sort input.
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    # One grid block per 8192 input bytes; one 256-wide int32 row per block
    # plus one extra row.
    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    # grids * 256 entries of 4 bytes each.
    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
    #if not stream:
        #split = cuda.from_device_like(dsplit, pfxs)
        #split_ = cuda.from_device_like(dsplit_, pfxs)
        #print np.all(split == split_)

    # Two 256-entry, 4-byte-per-entry scratch buffers for 'prefix_sum'.
    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            dshortseg_pfxs, dshortseg_sums,
            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    # Output buffer: 4 bytes per input element (read back as int32 below).
    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
            block=(32, 32, 1), grid=(grids, 1), stream=stream)
    if not stream:
        # Verification path: only taken when no stream was supplied.
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        # Formats each 3-tuple produced by `rle` (defined elsewhere) on its
        # own tab-indented line.
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp: fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp: fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
开发者ID:gijzelaerr,项目名称:cuburn,代码行数:47,代码来源:sortbench.py


示例19: make_superblocks

def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields=None):
    """Pack per-block data into aligned superblocks and upload them.

    Parameters
    ----------
    devdata
        Object whose ``align`` method rounds the superblock size.
    struct_name
        Name given to the generated ``Struct``.
    single_item : list of ``([block1, block2, ...], decl)``
        One data block per superblock for each part; all blocks of a part
        must have the same length.
    multi_item : list of ``([[item1, item2, ...], ...], decl)``
        Several items per superblock for each part; the items are joined
        and padded to the longest joined block of that part.
    extra_fields : dict, optional
        Extra keyword fields stored on the returned record. Defaults to
        no extra fields.

    Returns
    -------
    SuperblockedDataStructure
        Record with ``struct``, ``device_memory``, ``block_bytes`` and
        ``data`` fields plus everything from ``extra_fields``.
    """
    from hedge.backends.cuda.tools import pad_and_join

    # A None sentinel replaces the former shared mutable `{}` default.
    if extra_fields is None:
        extra_fields = {}

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # Called for its check only: every block of the part must have
        # the same length.
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            # NOTE(review): `pad` is not imported in this function;
            # presumably a module-level helper - confirm.
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
开发者ID:paulcazeaux,项目名称:hedge,代码行数:57,代码来源:tools.py


示例20: set_refsmiles

    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.

        Side effects: sets ``self.gpu.rsmiles``, ``self.gpu.rcounts``, ``self.gpu.rl_gpu`` and
        ``self.gpu.rmag_gpu``, binds the ``tex2lr``/``tex2cr`` texture references, and records
        ``self.rlengths``, ``self.rshape`` and ``self.nref``.

        Raises ValueError (manual path only) when the padded Lingo matrix exceeds the size limits.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            # The size check is applied to the padded matrix, not the caller's original.
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            # One descriptor, sized from the padded count matrix, is used for both
            # texture bindings; each binding passes its own row stride as the pitch.
            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            # Wait for the async copies before dropping the host-side temporaries.
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate the magnitude buffer, sized like the length vector
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            # Caller supplied the magnitudes: upload them as-is
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # No magnitudes supplied: compute them on the GPU with refMagKernel
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
开发者ID:ihaque,项目名称:SIML,代码行数:56,代码来源:GPULingo.py



注:本文中的pycuda.driver.to_device函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python driver.Context类代码示例发布时间:2022-05-25
下一篇:
Python driver.pagelocked_zeros函数代码示例发布时间:2022-05-25
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap