Python gpuarray.empty Function Code Examples


This article collects typical usage examples of the Python function pycuda.gpuarray.empty. If you are wondering what exactly empty does, how to call it, or what real-world uses of empty look like, the curated code examples below should help.



The following presents 20 code examples of the empty function, ordered by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
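Before the examples, here is a minimal sketch of the basic call pattern, assuming a working PyCUDA installation; the array shape and values are illustrative assumptions only. gpuarray.empty allocates uninitialized device memory for a given shape and dtype, analogous to numpy.empty, so its contents must be written before they are read.

import numpy as np
import pycuda.autoinit  # initializes a default CUDA context
import pycuda.gpuarray as gpuarray

# Allocate an uninitialized 4x4 float32 array on the GPU; its contents are arbitrary.
a_gpu = gpuarray.empty((4, 4), dtype=np.float32)

# Fill it before use, e.g. by copying host data onto the device.
a_gpu.set(np.arange(16, dtype=np.float32).reshape(4, 4))

# Copy back to the host to inspect the result.
print(a_gpu.get())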

Example 1: _minmax_impl

def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        target = gpuarray.empty(n, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
Developer: oursland, Project: scikits.cuda, Lines: 32, Source: misc.py


Example 2: generate_shifts_2d

def generate_shifts_2d(width, height, n_samples, with_hot=False):
    x_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (width - 0.01)
    x_shifts = x_shifts.astype(np.uint32)

    y_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (height - 0.01)
    y_shifts = y_shifts.astype(np.uint32)

    if with_hot:
        shifts_hot = gp.empty((width * height, n_samples), np.float32)
        threads_per_block = 32
        n_blocks = int(math.ceil(n_samples / threads_per_block))
        gpu_shift_to_hot_2d(x_shifts, y_shifts, shifts_hot,
                            np.uint32(shifts_hot.strides[0]/4),
                            np.uint32(shifts_hot.strides[1]/4),
                            np.uint32(width), np.uint32(height), np.uint32(n_samples),
                            block=(threads_per_block, 1, 1), grid=(n_blocks, 1))
        return x_shifts, y_shifts, shifts_hot
    else:
        shifts = gp.empty((2, n_samples), np.float32)
        threads_per_block = 32
        n_blocks = int(math.ceil(n_samples / threads_per_block))
        gpu_vstack(y_shifts, x_shifts, shifts,
                   np.uint32(shifts.strides[0]/4), np.uint32(shifts.strides[1]/4),
                   np.uint32(n_samples),
                   block=(threads_per_block, 1, 1), grid=(n_blocks, 1))
        return x_shifts, y_shifts, shifts
Developer: surban, Project: ml, Lines: 26, Source: shift_gpu.py


Example 3: sample_dropout_mask

def sample_dropout_mask(x, dropout_probability=.5, columns=None, stream=None, target=None,
                        dropout_mask=None, dropout_prob_array=None):
    """ Samples a dropout mask and applies it in place"""

    assert x.flags.c_contiguous

    if columns is not None:
        assert len(columns) == 2
        x_tmp = x
        x = extract_columns(x, columns[0], columns[1])

    shape = x.shape

    if dropout_prob_array is None:
        dropout_prob_array = gpuarray.empty(shape, x.dtype)
    sampler.fill_uniform(dropout_prob_array, stream)

    if dropout_mask is None:
        dropout_mask = gpuarray.empty(shape, np.int8)

    if target is None: target = x
    
    all_kernels['sample_dropout_mask'](
        x, target, dropout_mask, dropout_prob_array,
        np.float32(dropout_probability))

    if columns is not None:
        insert_columns(x, x_tmp, columns[0])

    return dropout_mask
Developer: Snazz2001, Project: hebel, Lines: 30, Source: elementwise.py


Example 4: get

    def get(self, V_gpu, xcl_gpu, xcr_gpu, W_gpu, x_gpu, stream=None):
        """
        """
        if stream is None:
            stream = cuda.Stream()

        # Temporary variables
        z_gpu = gpuarray.empty((self.params['V_d'], 
                                self.params['V_w']), self.params['dtype'])
        xc2_gpu = gpuarray.empty(2*self.params['w_d'], self.params['dtype'])
            
        blockDim_x = self.params['V_w']
        
        self._func[0](xcl_gpu, xcr_gpu, xc2_gpu,
                      block=(blockDim_x, 1, 1),
                      stream=stream)            
        
        gridDim_z = self.params['V_d']
        blockDim_y = self.params['V_w']
        
        self._func[1](V_gpu, xc2_gpu, z_gpu,
                      block=(1, blockDim_y, 1),
                      grid=(1, 1, gridDim_z),
                      stream=stream)

        blockDim_y = self.params['W_h']
        
        self._func[2](W_gpu, xc2_gpu, z_gpu, x_gpu,
                      block=(1, blockDim_y, 1),
                      grid=(1, 1, 1),
                      stream=stream)                   
Developer: jolinxql, Project: RNTN, Lines: 31, Source: RNTN_cuda.py


Example 5: enable3d

    def enable3d(self):
        self.point1 = self.point-(self.mesh_diagonal_norm/60)*self.axis2
        self.point2 = self.point+(self.mesh_diagonal_norm/60)*self.axis2

        self.viewing_angle = 0.0

        pos1, dir1 = from_film(self.point1, axis1=self.axis1, axis2=self.axis2,
                               size=self.size, width=self.film_width)
        pos2, dir2 = from_film(self.point2, axis1=self.axis1, axis2=self.axis2,
                               size=self.size, width=self.film_width)

        self.rays1 = gpu.GPURays(pos1, dir1,
                                 max_alpha_depth=self.max_alpha_depth)
        self.rays2 = gpu.GPURays(pos2, dir2,
                                 max_alpha_depth=self.max_alpha_depth)

        scope_size = (self.size[0]//4, self.size[0]//4)

        scope_pos, scope_dir = from_film(self.point, axis1=self.axis1,
                                         axis2=self.axis2, size=scope_size,
                                         width=self.film_width/4.0)

        self.scope_rays = gpu.GPURays(scope_pos, scope_dir)

        self.scope_pixels_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.uint32)

        self.pixels1_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
        self.pixels2_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
        
        self.distances_gpu = ga.empty(self.scope_rays.pos.size,
                                      dtype=np.float32)
        self.display3d = True
Developer: BenLand100, Project: chroma, Lines: 32, Source: camera.py


Example 6: __init__

    def __init__(self, res=(640, 480)):
        mod = cuda.SourceModule(file("cpp/trace.cu").read(), keep=True, options=['-I../cpp'], no_extern_c=True)
        self.InitEyeRays = mod.get_function("InitEyeRays")
        self.InitFishEyeRays = mod.get_function("InitFishEyeRays")
        self.Trace = mod.get_function("Trace")
        self.ShadeSimple = mod.get_function("ShadeSimple")
        self.mod = mod

        self.block = (16, 32, 1)  # 15: 32, 18: 28, 19: 24
        self.grid = ( res[0]/self.block[0], res[1]/self.block[1] )
        self.resx, self.resy = (self.grid[0]*self.block[0], self.grid[1]*self.block[1])

        self.smallblock = (16, 16, 1)
        self.smallgrid = ( res[0]/self.smallblock[0], res[1]/self.smallblock[1] )

        self.d_img = ga.empty( (self.resy, self.resx, 4), uint8 )

        '''
        struct RayData
        {
          float3 dir;
          float t;
          VoxNodeId endNode;
          int endNodeChild;
          float endNodeSize;
        };
        '''
        raySize = struct.calcsize("3f f i i f")
        self.d_rays = ga.empty( (self.resy, self.resx, raySize), uint8 )

        self.setLightPos((0.5, 0.5, 1))
        self.detailCoef = 10.0
Developer: znah, Project: yoxel-voxel, Lines: 32, Source: trace_cuda.py


Example 7: test_cublas_bug

def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some unknown
    reason. Likely this is caused by swaprows causing memory corruption.

    NOTE: this was confirmed by nvidia to be a bug within CUDA, and should be
          fixed in CUDA 6.5
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131

    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m
    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc

    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()
Developer: stachon, Project: binet, Lines: 34, Source: test_op.py


Example 8: test_cublasSgetriBatched

    def test_cublasSgetriBatched(self):
        l,m = 11,7
        np.random.seed(1)
        A = np.random.rand(l,m, m).astype(np.float32)
        
        a_gpu = gpuarray.to_gpu(A)
        a_arr = bptrs(a_gpu)
        c_gpu = gpuarray.empty((l,m,m), np.float32)
        c_arr = bptrs(c_gpu)

        p_gpu = gpuarray.empty((l,m), np.int32)
        i_gpu = gpuarray.zeros(l, np.int32)

        cublas.cublasSgetrfBatched(self.cublas_handle, 
                    m, a_arr.gpudata, m, p_gpu.gpudata, 
                    i_gpu.gpudata, l)

        cublas.cublasSgetriBatched(self.cublas_handle, 
                    m, a_arr.gpudata, m, p_gpu.gpudata, c_arr.gpudata,m,
                    i_gpu.gpudata, l)
        
        X = np.array(list(map(np.linalg.inv, A)))
        X_ = c_gpu.get()

        assert np.allclose(X,X_,6)
Developer: teodor-moldovan, Project: scikits.cuda, Lines: 25, Source: test_cublas.py


Example 9: _create_halo_arrays

    def _create_halo_arrays(self):

        # Allocate space for the halos: two per face,
        # one for sending and one for receiving.

        nz, ny, nx = self.local_dims
        sw = self.stencil_width
        # create two halo regions for each face, one holding
        # the halo values to send, and the other holding
        # the halo values to receive.

        self.left_recv_halo = gpuarray.empty([nz,ny,sw], dtype=np.float64)
        self.left_send_halo = self.left_recv_halo.copy()
        self.right_recv_halo = self.left_recv_halo.copy()
        self.right_send_halo = self.left_recv_halo.copy()
    
        self.bottom_recv_halo = gpuarray.empty([nz,sw,nx], dtype=np.float64)
        self.bottom_send_halo = self.bottom_recv_halo.copy()
        self.top_recv_halo = self.bottom_recv_halo.copy()
        self.top_send_halo = self.bottom_recv_halo.copy()

        self.back_recv_halo = gpuarray.empty([sw,ny,nx], dtype=np.float64)
        self.back_send_halo = self.back_recv_halo.copy()
        self.front_recv_halo = self.back_recv_halo.copy()
        self.front_send_halo = self.back_recv_halo.copy()
Developer: shwina, Project: gpuDA, Lines: 25, Source: gpuda.py


Example 10: gradient_gpu

def gradient_gpu(y_gpu, mode='valid'):

  shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
               int(np.ceil(float(shape[0])/block_size[1])))
  shared_size = int((1+block_size[0])*(1+block_size[1])*dtype.itemsize)

  preproc = _generate_preproc(dtype, shape)
  mod = SourceModule(preproc + kernel_code, keep=True)

  if mode == 'valid':
    gradient_gpu = mod.get_function("gradient_valid")

    gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]-1), y_gpu.dtype)
    grady_gpu = cua.empty((y_gpu.shape[0]-1, y_gpu.shape[1]), y_gpu.dtype)

  if mode == 'same':
    gradient_gpu = mod.get_function("gradient_same")

    gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    grady_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    
  gradient_gpu(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata,
               block=block_size, grid=grid_size, shared=shared_size)

  return (gradx_gpu, grady_gpu)
Developer: matthiaslee, Project: VMBD, Lines: 28, Source: gputools.py


Example 11: initializeGpuMemory

 def initializeGpuMemory(self):
     K = self.modelParams["proc_id_model","K"]
     
     # Sufficient statistics for the parameters of G kernels
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
     self.gpuPtrs["impulse_model","GS"] = gpuarray.empty_like(self.base.dSS["dS"])
Developer: richardkwo, Project: pyhawkes, Lines: 7, Source: impulse_models.py


Example 12: _init_comm_bufs

    def _init_comm_bufs(self):
        """
        Buffers for sending/receiving data from other modules.

        Notes
        -----
        Must be executed after `_init_port_dicts()`.
        """

        # Buffers (and their interfaces and MPI types) for receiving data
        # transmitted from source modules:
        self._in_buf = {}
        self._in_buf['gpot'] = {}
        self._in_buf['spike'] = {}
        self._in_buf_int = {}
        self._in_buf_int['gpot'] = {}
        self._in_buf_int['spike'] = {}        
        self._in_buf_mtype = {}
        self._in_buf_mtype['gpot'] = {}
        self._in_buf_mtype['spike'] = {}        
        for in_id in self._in_ids:
            self._in_buf['gpot'][in_id] = \
                gpuarray.empty(len(self._in_port_dict_ids['gpot'][in_id]),
                               self.pm['gpot'].dtype)
            self._in_buf_int['gpot'][in_id] = bufint(self._in_buf['gpot'][in_id])
            self._in_buf_mtype['gpot'][in_id] = \
                dtype_to_mpi(self._in_buf['gpot'][in_id].dtype)
            self._in_buf['spike'][in_id] = \
                gpuarray.empty(len(self._in_port_dict_ids['spike'][in_id]),
                               self.pm['spike'].dtype)
            self._in_buf_int['spike'][in_id] = bufint(self._in_buf['spike'][in_id])
            self._in_buf_mtype['spike'][in_id] = \
                dtype_to_mpi(self._in_buf['spike'][in_id].dtype)

        # Buffers (and their interfaces and MPI types) for transmitting data to
        # destination modules:
        self._out_buf = {}
        self._out_buf['gpot'] = {}
        self._out_buf['spike'] = {}
        self._out_buf_int = {}
        self._out_buf_int['gpot'] = {}
        self._out_buf_int['spike'] = {}
        self._out_buf_mtype = {}
        self._out_buf_mtype['gpot'] = {}
        self._out_buf_mtype['spike'] = {}
        for out_id in self._out_ids:
            self._out_buf['gpot'][out_id] = \
                gpuarray.empty(len(self._out_port_dict_ids['gpot'][out_id]),
                               self.pm['gpot'].dtype)
            self._out_buf_int['gpot'][out_id] = bufint(self._out_buf['gpot'][out_id])
            self._out_buf_mtype['gpot'][out_id] = \
                dtype_to_mpi(self._out_buf['gpot'][out_id].dtype)

            self._out_buf['spike'][out_id] = \
                gpuarray.empty(len(self._out_port_dict_ids['spike'][out_id]),
                               self.pm['spike'].dtype)
            self._out_buf_int['spike'][out_id] = bufint(self._out_buf['spike'][out_id])
            self._out_buf_mtype['spike'][out_id] = \
                dtype_to_mpi(self._out_buf['spike'][out_id].dtype)
Developer: ScartleRoy, Project: neurokernel, Lines: 59, Source: core_gpu.py


Example 13: getFields

 def getFields(self,x,y):
   outX = gpuarray.empty((self.Nfields,x,y),np.float32)
   outY = gpuarray.empty((self.Nfields,x,y),np.float32)
   grid = (int(ceil(x/32)),int(ceil(y/32)))
   block = (int(ceil(x/grid[0])),int(ceil(y/grid[1])),1)
   for i in range(self.Nfields):
     self.resampleF[i].prepared_call(grid,block,outX[i,:,:].gpudata,outY[i,:,:].gpudata,np.int32(x),np.int32(y))
   return outX,outY
Developer: LaboratoireMecaniqueLille, Project: GPGPU, Lines: 8, Source: iCorrel.py


Example 14: show_values

def show_values(matrix_size, threads_per_block):
    a_cpu = np.random.randn(matrix_size, matrix_size).astype(np.float32)

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)
    id_groups_x = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_groups_y = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_threads_x = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_threads_y = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_cell = gpuarray.empty((matrix_size, matrix_size), np.float32)

    blocks = (threads_per_block, 1, 1)

    blocks_per_side = int(matrix_size / threads_per_block)

    if (blocks_per_side * threads_per_block) < matrix_size:
        blocks_per_side = blocks_per_side + 1

    grid = (blocks_per_side, matrix_size, 1)

    print("Blocks: ", blocks)
    print("Grid: ", grid)

    kernel_code = kernel_source_code % {'MATRIX_SIZE': matrix_size, 'BLOCK_SIZE': threads_per_block}

    compiled_kernel = compiler.SourceModule(kernel_code)

    kernel_binary = compiled_kernel.get_function("markThreadID")

    kernel_binary(
        # inputs
        a_gpu,
        # outputs
        id_groups_x, id_groups_y, id_threads_x, id_threads_y, id_cell,
        block=blocks,
        grid=grid
    )

    id_blocks_x_cpu = id_groups_x.get()
    id_blocks_y_cpu = id_groups_y.get()
    id_threads_x_cpu = id_threads_x.get()
    id_threads_y_cpu = id_threads_y.get()
    id_cell_cpu = id_cell.get()

    print("id_blocks_x_cpu")
    print(id_blocks_x_cpu)

    print("id_blocks_y_cpu")
    print(id_blocks_y_cpu)

    print("id_threads_x_cpu")
    print(id_threads_x_cpu)

    print("id_threads_y_cpu")
    print(id_threads_y_cpu)

    print("id_cell_cpu")
    print(id_cell_cpu)
Developer: javierip, Project: parallel-code-examples, Lines: 58, Source: main.py


Example 15: initializeGpuMemory

 def initializeGpuMemory(self):
     """
     Allocate GPU memory for the base model parameters
     """
     N = self.base.data.N
     K = self.base.data.K
             
     self.gpuPtrs["proc_id_model","C"] = gpuarray.empty((N,), dtype=np.int32)
     self.gpuPtrs["proc_id_model","Ns"] = gpuarray.empty((K,), dtype=np.int32)
Developer: richardkwo, Project: pyhawkes, Lines: 9, Source: process_id_models.py


Example 16: removeProcessEventHandler

 def removeProcessEventHandler(self, procId):
     """
     Remove process procID from the set of processes and update data structures
     accordingly. We can assume that the base model has updated K.
     """
     K = self.modelParams["proc_id_model","K"]
     
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
Developer: richardkwo, Project: pyhawkes, Lines: 9, Source: impulse_models.py


Example 17: test_work_area

 def test_work_area(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False)
     work_area = gpuarray.empty((plan.worksize,), np.uint8)
     plan.set_work_area(work_area)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
Developer: Nodd, Project: scikit-cuda, Lines: 10, Source: test_fft.py


Example 18: initializeGpuMemory

 def initializeGpuMemory(self):
     K = self.modelParams["proc_id_model","K"]
     N = self.base.data.N
     
     self.gpuPtrs["graph_model","A"] = gpuarray.empty((K,K), dtype=np.bool)
     self.gpuPtrs["graph_model","WGS"]   = gpuarray.empty((K,N), dtype=np.float32)
     
     qratio_width = int(np.ceil(np.float32(self.base.data.N)/self.params["blockSz"]))
     self.gpuPtrs["graph_model","qratio"] = gpuarray.empty((qratio_width,), dtype=np.float64)
     self.gpuPtrs["graph_model","lkhd_ratio"] = gpuarray.empty((1,), dtype=np.float32)
Developer: richardkwo, Project: pyhawkes, Lines: 10, Source: graph_model_extension.py


Example 19: get_workspace

    def get_workspace(self, n):
        from pyfft.cuda import Plan as pycufftplan
        import pycuda.gpuarray as gpuarray

        ws = self.get(n)
        if ws: return ws
        return self.setdefault(n,
            (pycufftplan(int(n), stream=self.stream, normalize=False),
             gpuarray.empty(n, dtype=complex64(0.).dtype),
             gpuarray.empty(n, dtype=complex64(0.).dtype)))
Developer: GeraintPratten, Project: lalsuite, Lines: 10, Source: overlap_cuda.py


Example 20: benchmark

    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
                len(discr.mesh.elements)/self.plan.elements_per_block())
        all_fluxes_on_faces = [gpuarray.empty(
            (block_count * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
                for i in range(len(self.fluxes))]

        field = gpuarray.empty(
                (self.plan.input_dofs_per_block() * block_count,),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=True)

        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            if block_count >= 2**16:
                return None

            try:
                gather.prepared_call(
                        (block_count, 1), block,
                        0,
                        fdata.device_memory,
                        *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                        )
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return 1e-3/count * stop.time_since(start)
Developer: felipeh, Project: hedge, Lines: 55, Source: fluxgather.py



Note: The pycuda.gpuarray.empty examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers, and the copyright of the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or using the code; do not reproduce without permission.

