
Python driver.from_device Function Code Examples


This article collects typical usage examples of the pycuda.driver.from_device function in Python. If you are unsure how from_device is used in practice, or are looking for concrete examples of calling it, the curated code samples below should help.



A total of 20 code examples of the from_device function are shown below, sorted by popularity by default. You can upvote the examples you find helpful; this feedback helps the site recommend better Python code examples.
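Before the examples, here is a minimal round-trip sketch of the API they all rely on: pycuda.driver.to_device copies a NumPy array into freshly allocated device memory and returns a device pointer, and pycuda.driver.from_device copies the data back into a new NumPy array given that pointer, a shape, and a dtype. This sketch is illustrative only (it is not taken from any of the projects below) and assumes a CUDA-capable GPU with pycuda installed.

import numpy as np
import pycuda.autoinit   # creates a default CUDA context on import
import pycuda.driver as cuda

host_array = np.arange(16, dtype=np.float32)
dev_ptr = cuda.to_device(host_array)                               # allocate device memory and copy host -> device
result = cuda.from_device(dev_ptr, host_array.shape, np.float32)   # copy device -> host into a new ndarray
assert np.allclose(result, host_array)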

Example 1: get_phir_gpu

def get_phir_gpu (XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area) # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w    = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i]   = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i]  = XK[i/K]
        X_Vc[i]  = XV[i/K]

    toc = time.time()
    time_set = toc - tic
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu   = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu  = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu  = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu  = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))


    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu, 
            surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev, 
            numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
Developer: cdcooper84 | Project: pygbe | Lines: 59 | Source: projection.py


Example 2: send

def send(target, nx, ny, nz, fx_gpu, fy_gpu):
	if target < myrank: 
		offset_fx = int(fx_gpu)
		offset_fy = int(fy_gpu)
	else: 
		offset_fx = int(fx_gpu) + nx*ny*(nz-1)*nof
		offset_fy = int(fy_gpu) + nx*ny*(nz-1)*nof

	mpi.world.send(target, 0, cuda.from_device(offset_fx, (nx,ny), np.float32))
	mpi.world.send(target, 1, cuda.from_device(offset_fy, (nx,ny), np.float32))
Developer: wbkifun | Project: fdtd_accelerate | Lines: 10 | Source: 035-block1d-texture-smem-if-over_maxgrid-mpi-cpu.py


Example 3: exchange

def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
	ctx1 = cuda.Device(dev1).make_context()
	a = cuda.from_device(int(a_gpu)+(nx-2)*ny*nof, (ny,), np.float32)
	ctx1.pop()

	ctx2 = cuda.Device(dev2).make_context()
	cuda.memcpy_htod(int(b_gpu), a)
	b = cuda.from_device(int(b_gpu)+ny*nof, (ny,), np.float32)
	ctx2.pop()

	ctx1 = cuda.Device(dev1).make_context()
	cuda.memcpy_htod_async(int(a_gpu)+(nx-1)*ny*nof, b)
	ctx1.pop()
Developer: wbkifun | Project: fdtd_accelerate | Lines: 13 | Source: 03-3GPU.py


Example 4: train_gpu

    def train_gpu(self, num_iter, model_file_path):
        if self.batch == 0:
            # Prepare to send the numpy array to gpu
            self.syn1_gpu = cuda.to_device(self.syn1)
            # Create word idx and related data-structure.
            self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
            word_rep_ptr = int(self.base_word_rep)
            self.word_reps = {}
            for w_idx, word in sorted(self.dictionary.items()):
                word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
                word_point = self.words_rep[word][1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
                word_rep_ptr += WordRep.memsize
            print "GPU transfers done."


        self.sent_reps_gpu = cuda.to_device(self.sent_reps)
        # Prepare sentences for GPU transfer.
        idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                         for sentence in self.sentences]

        # Prepare the kernel function
        kernel = self.kernel_str.get_function("train_sg")
        words = np.empty(self.num_sents, dtype=np.int32)
        # sent_reps = np.copy(self.sent_reps)
        for iter in range(num_iter):
            # Sample words for each sentence and transfer to GPU
            for s_idx in range(self.num_sents):
                words[s_idx] = random.choice(idx_sentences[s_idx])
            words_gpu = cuda.to_device(words)
            kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
                   block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
            # autoinit.context.synchronize()
        self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
Developer: ustbliubo2014 | Project: DeepLearn | Lines: 35 | Source: paragraph_vector.py


Example 5: send

 def send(s, rank, tag_mark, direction):
     if direction == "f":
         offset_gpu = int(s.arr_gpu) + s.ny * nof
     elif direction == "b":
         offset_gpu = int(s.arr_gpu) + (s.nx - 2) * s.ny * nof
     print type(offset_gpu)
     comm.send(rank, tag_mark, cuda.from_device(offset_gpu, (s.ny,), s.dtype))
Developer: wbkifun | Project: fdtd_accelerate | Lines: 7 | Source: 50-3GPU-mpi4py.py


Example 6: test_stub

        def test_stub(shift, trials=10, rounds=1):
            # Run once so that evt_a doesn't include initialization time
            sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_a = cuda.Event().record(stream)
            for i in range(trials):
                buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_b = cuda.Event().record(stream)
            evt_b.synchronize()
            dur = evt_b.time_since(evt_a) / (rounds * trials)
            print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000),
                    count * sorter.radix_bits / (dur * 32 * 1000))

            if shift == 0 and correctness:
                print '\nTesting correctness'
                out = cuda.from_device(buf, (count,), np.uint32)
                sort = np.sort(keys)
                if np.all(out == sort):
                    print 'Correct'
                else:
                    nz = np.nonzero(out != sort)[0]
                    print sorted(set(nz >> 13))
                    for i in nz:
                        print i, out[i-1:i+2], sort[i-1:i+2]
                    assert False, 'Oh no'
Developer: gijzelaerr | Project: cuburn | Lines: 26 | Source: sort.py


Example 7: get_from_device

    def get_from_device(self, index_list=None):
        '''
        Copy array data from GPU device and wrap in a numpy arrays.

        If index_list is None, return list of numpy arrays (one/array).
        If index_list is a single integer, return single numpy array.
        If index_list is an iterable, list of numpy arrays
                (one/selected array).
        '''
        single = False
        if index_list is None:
            index_list = range(len(self.data))
        else:
            try:
                int(index_list)
                index_list = [index_list]
                single = True
            except TypeError:
                pass
        results = []
        try:
            for i in index_list:
                results.append(cuda.from_device(self.data[i], self.shapes[i],
                            self.dtypes[i]))
        except cuda.LaunchError:
            import traceback
            traceback.print_exc()
            traceback.print_stack()
            raise ValueError, 'Invalid device pointer: %d' % i
        if single:
            return results[0]
        else:
            return results
Developer: cfobel | Project: pycuda_helpers | Lines: 33 | Source: cuda.py


Example 8: evaluate

  def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on training set.
    
    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    
    @return output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
      raise ValueError("Need %d Parameter structures (provided %d)" % (
        self.popSize, len(params)))
    
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)
    
    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)

    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    
    if returnOutputs:
      return self.outputsMat
Developer: cpatulea | Project: evolution | Lines: 35 | Source: ann.py


Example 9: print_arr_gpus

def print_arr_gpus(s):
	s.send_result()
	if mpi.rank == 0: 
		result = cuda.from_device(s.arr_gpu,s.shape,s.dtype)
		for i in range(1,ngpu): 
			result = np.concatenate((result,mpi.world.recv(i,10)))
		for i in xrange(s.ny):
			print result[:s.nx,i],'\t',result[s.nx:2*s.nx,i],'\t',result[2*s.nx:,i]
Developer: wbkifun | Project: fdtd_accelerate | Lines: 8 | Source: 20-3GPU-mpi-non_blocking.py


Example 10: print_arr_gpus

def print_arr_gpus(ngpu, nx, ny, a_gpu):
	send_result(nx, ny, a_gpu)
	if mpi.rank == 0: 
		result = cuda.from_device(a_gpu, (nx,ny), 'float32')
		print ngpu
		for i in range(1,ngpu): 
			result = np.concatenate((result, mpi.world.recv(i,10)))
		for i in xrange(ny):
			print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i]
Developer: wbkifun | Project: fdtd_accelerate | Lines: 9 | Source: 010-mpi-exchange.py


Example 11: P2P_gpu

def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc, K_gpu, V_gpu, 
            surf, LorY, K_diag, IorE, L, w, param, timing, kernel):

    tic = cuda.Event() 
    toc = cuda.Event() 

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mxDev  = cuda.to_device(mx.astype(REAL))
    myDev  = cuda.to_device(my.astype(REAL))
    mzDev  = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    # Check if internal or external to send correct singular integral
    if IorE==1:
        sglInt = surfSrc.sglInt_intDev
    else:
        sglInt = surfSrc.sglInt_extDev


    direct_gpu(K_gpu, V_gpu, surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mxDev, myDev, mzDev, 
                mKcDev, mVcDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, sglInt,
                surfSrc.vertexDev, int32(ptr_offset), int32(ptr_list), 
                int32(LorY), REAL(param.kappa), REAL(param.threshold),
                int32(param.BlocksPerTwig), int32(param.NCRIT), REAL(K_diag), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = zeros(param.Nround, dtype=int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
Developer: LEONOB2014 | Project: pygbe | Lines: 57 | Source: FMMutils.py


Example 12: get_heartbeat

def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block


    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)
    
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indecies = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indecies)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks, length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)

    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr,
                   cd_index,
                   numpy.int32(sampling_rate),
                   numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    clean_result(dev_rr, numpy.int32(120), numpy.int32(40),
                 numpy.int32(1), numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)
    index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
Developer: cconklin | Project: ECE207Proj2 | Lines: 43 | Source: plotecg.py


Example 13: lift

  def lift(self, n):
    """Returns (positive rate within n largest) / (overall positive rate) for
       each individual.
    
    @return list of counts, in order of individuals
    """
    self.countKernel.prepared_call(self.countGridDim,
                                   self.outputs,
                                   self.trainSet.size,
                                   len(self.trainSet.positives),
                                   self.popSize,
                                   self.thresholds,
                                   self.counts)
    
    driver.Context.synchronize()

    countsMat = driver.from_device(self.counts,
                                   shape=(self.popSize, self.countBlockDim[0]),
                                   dtype=np.uint32)
    #log.debug("counts %r: %s", countsMat.shape, str(countsMat))
    log.debug("count sum over threads: %s", str(countsMat.sum(axis=1)))
    
    self.countSums = countsMat.sum(axis=1)
    
    self.nlargestPositiveRate = np.float32(self.countSums) / n
    log.debug("positive rate (n largest outputs): %s", str(self.nlargestPositiveRate))
    
    overallPositiveRate = float(len(self.trainSet.positives)) / float(self.trainSet.size)
    log.debug("positive rate (overall): %.04f", overallPositiveRate)
    
    lifts = self.nlargestPositiveRate / overallPositiveRate
    
    sortedLifts = sorted(enumerate(lifts), key=lambda (i, l): l, reverse=True)
    topIndex, topLift = sortedLifts[0]
    
    topOutputs = self.outputsMat[topIndex]
    
    nans = np.sum(np.isnan(topOutputs))
    neginfs = np.sum(np.isneginf(topOutputs))
    posinfs = np.sum(np.isposinf(topOutputs))
    omin = np.nanmin(topOutputs)
    omax = np.nanmax(topOutputs)
    threshold = self.thresholdsMat[topIndex]
    
    """
    log.info("The top ANN's outputs are:")
    log.info(
      "  %.02f%% NaN, %.02f%% -inf, %.02f%% +inf, min %.02e, max %.02e, thresh %.02e",
      100.0 * nans / len(topOutputs),
      100.0 * neginfs / len(topOutputs),
      100.0 * posinfs / len(topOutputs),
      omin, omax, threshold)
    """
    
    return lifts
Developer: cpatulea | Project: evolution | Lines: 55 | Source: ann2layer.py


Example 14: nlargest_cpu

def nlargest_cpu(ann, n):
  """CPU implementation of nlargest."""
  outputs = driver.from_device(ann.outputs,
                               shape=(ann.popSize, ann.trainSize),
                               dtype=np.float32)

  thresholds = []
  for row in outputs:
    sortedRow = sorted(row, reverse=True)
    thresholds.append(sortedRow[n])

  return thresholds
Developer: cpatulea | Project: evolution | Lines: 12 | Source: bench_nlargest.py


Example 15: P2PKt_gpu

def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu, 
            surf, LorY, w, param, timing, kernel):

    if param.GPU==1:
        tic = cuda.Event() 
        toc = cuda.Event() 
    else:
        tic = Event()
        toc = Event()

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays


    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu, 
                surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev, 
                surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, 
                surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), 
                numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
Developer: cdcooper84 | Project: pygbe | Lines: 52 | Source: FMMutils.py


Example 16: go_sort_old

def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
    #if not stream:
        #split = cuda.from_device_like(dsplit, pfxs)
        #split_ = cuda.from_device_like(dsplit_, pfxs)
        #print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            dshortseg_pfxs, dshortseg_sums,
            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
            block=(32, 32, 1), grid=(grids, 1), stream=stream)
    if not stream:
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp: fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp: fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
Developer: gijzelaerr | Project: cuburn | Lines: 47 | Source: sortbench.py


Example 17: test_mwc

def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[0].astype(np.uint64)
        states = seeds[1]
        carries = seeds[2]
        for i in range(rounds):
            step = np.frombuffer((mults * states + carries).data,
                       dtype=np.uint32).reshape((2, nthreads), order='F')
            states[:] = step[0]
            carries[:] = step[1]
            sums += states

        ctime = time.time() - ctime
        print "Took %g seconds." % ctime

        print "Trial %d, on device: " % trial,
        dsums = cuda.mem_alloc(8*nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds, dsums, np.float32(rounds),
                    block=(blockwidth,1,1), grid=(nblocks,1),
                    time_kernel=True)
        print "Took %g seconds." % dtime
        dsums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, dsums)):
            print "Sum discrepancy!"
            print sums
            print dsums
Developer: vincentmele | Project: cuburn | Lines: 40 | Source: mwc.py


Example 18: show

	def show(s, a):
		print a
		print cuda.from_device( s.arr_gpu, (6,5), np.float32 ).T
Developer: wbkifun | Project: fdtd_accelerate | Lines: 3 | Source: boostmpi-multiprocessing-02.py


Example 19: send

	def send(s,rank,tag_mark):
		a = cuda.from_device(s.arr_gpu,(6,5),np.float32)
		mpi.world.send( rank,tag_mark, a)
Developer: wbkifun | Project: fdtd_accelerate | Lines: 3 | Source: boostmpi-multiprocessing-02.py


Example 20: _print_interp_knots

 def _print_interp_knots(self, rdr, tsidx=5):
     infos = cuda.from_device(self.info_a.d_params,
             (tsidx + 1, len(rdr.packer)), f32)
     for i, n in zip(infos[-1], rdr.packer.packed):
         print '%60s %g' % ('_'.join(n), i)
Developer: stevenrobertson | Project: cuburn | Lines: 5 | Source: render.py



Note: The pycuda.driver.from_device examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and redistribution or use should follow each project's license. Do not republish without permission.


