本文整理汇总了 Python 中 pycuda.driver.memcpy_htod 函数的典型用法代码示例。如果您正苦于以下问题:Python memcpy_htod 函数的具体用法?Python memcpy_htod 怎么用?Python memcpy_htod 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 memcpy_htod 函数的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。
示例1: __init__
def __init__(self, view_tile, size, sigma, debug=False):
    """Set up the host grid, its device copy and the Gaussian kernel.

    Args:
        view_tile: object providing ``get_Data()`` and ``get_View()``.
        size: two-element grid shape; both entries must be at least 2.
        sigma: stored unchanged on the instance.
        debug: stored unchanged on the instance.

    Raises:
        ValueError: if either entry of ``size`` is smaller than 2.
        Exception: if a data set is still not C-contiguous after conversion.
    """
    self.debug = debug
    if any(extent < 2 for extent in (size[0], size[1])):
        raise ValueError("Split needs to be at least 2x2")

    self.data_sets = view_tile.get_Data()
    for dset in self.data_sets:
        points = dset.getDataSet()
        if points.flags['C_CONTIGUOUS']:
            continue
        print("NOT CONTIGUOUS, trying to reformat the points")
        points = np.require(points, dtype=points.dtype, requirements=['C'])
        if not points.flags['C_CONTIGUOUS']:
            raise Exception("Points are not contiguous")
        # NOTE(review): nesting was reconstructed from a flattened listing;
        # confirm the write-back belongs to the "was not contiguous" branch.
        dset.setDataSet(points)

    self.view_tile = view_tile
    self.sigma = sigma
    self.pts_gpu = None

    # Zero-filled float32 grid on the host, mirrored into device memory.
    self.grid = np.zeros(size).astype(np.float32)
    self.grid_gpu = cuda.mem_alloc_like(self.grid)
    cuda.memcpy_htod(self.grid_gpu, self.grid)

    self.gpu_gaussian = SourceModule(self.__cuda_code).get_function("gpu_gaussian")
    self.view = self.view_tile.get_View()
    self.grid_size, self.block_size = self.__setup_cuda_sizes(size)

    # Step between neighbouring samples when the unit interval is covered by
    # size[1] (x) and size[0] (y) samples.
    n_rows, n_cols = size[0], size[1]
    self.dx = 1 / float(n_cols - 1)
    self.dy = 1 / float(n_rows - 1)
开发者ID:SCIInstitute,项目名称:MLM,代码行数:33,代码来源:gaussian_gpu_grid.py
示例2: test_constant_memory
def test_constant_memory(self):
    """Check that data written to ``__constant__`` memory reaches a kernel.

    32 random float32 values are copied into the module-level constant
    array, a 32-thread kernel copies them into a global buffer, and the
    buffer is read back and compared element-wise with the host input.
    """
    # contributed by Andrew Wagner
    module = SourceModule("""
__constant__ float const_array[32];
__global__ void copy_constant_into_global(float* global_result_array)
{
global_result_array[threadIdx.x] = const_array[threadIdx.x];
}
""")
    copy_constant_into_global = module.get_function("copy_constant_into_global")
    # get_global returns (device pointer, size); only the pointer is needed.
    const_array, _ = module.get_global('const_array')

    host_array = np.random.randint(0, 255, (32,)).astype(np.float32)
    global_result_array = drv.mem_alloc_like(host_array)
    drv.memcpy_htod(const_array, host_array)

    copy_constant_into_global(
        global_result_array,
        grid=(1, 1), block=(32, 1, 1))

    host_result_array = np.zeros_like(host_array)
    drv.memcpy_dtoh(host_result_array, global_result_array)

    # ``.all`` must be *called*: the bare bound method is always truthy, so
    # the assertion could never fail.
    assert (host_result_array == host_array).all()
开发者ID:davidweichiang,项目名称:pycuda,代码行数:28,代码来源:test_driver.py
示例3: edgetaper_gpu
def edgetaper_gpu(y_gpu, sf, win='barthann'):
    """Launch the ``edgetaper`` kernel on ``y_gpu`` using separable windows.

    Args:
        y_gpu: device array; its ``shape`` and ``dtype`` size the launch and
            the window tables.
        sf: two-element window size indexed as ``sf[0]`` (y) and ``sf[1]``
            (x); even entries are bumped to the next odd number.
        win: window name passed to ``scipy.signal.get_window``.

    The first half of each window (normalised by the product of the window
    maxima) is uploaded to the module globals ``wx`` and ``wy`` before the
    kernel is launched.
    """
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    # Ensure that sf is odd
    sf = sf + (1 - np.mod(sf, 2))
    wx = scipy.signal.get_window(win, sf[1])
    wy = scipy.signal.get_window(win, sf[0])
    maxw = wx.max() * wy.max()

    # Half-widths are used as slice bounds below, so they must be integers;
    # np.floor() would yield floats, which NumPy rejects as indices.
    hsf = np.floor_divide(sf, 2).astype(np.intp)
    wx = (wx[0:hsf[1]] / maxw).astype(dtype)
    wy = (wy[0:hsf[0]] / maxw).astype(dtype)

    preproc = _generate_preproc(dtype, shape)
    preproc += '#define wx_size %d\n' % wx.size
    preproc += '#define wy_size %d\n' % wy.size
    mod = SourceModule(preproc + edgetaper_code, keep=True)

    # Distinct local name so the kernel handle does not shadow this function.
    edgetaper_kernel = mod.get_function("edgetaper")
    wx_gpu, _ = mod.get_global('wx')
    wy_gpu, _ = mod.get_global('wy')
    cu.memcpy_htod(wx_gpu, wx)
    cu.memcpy_htod(wy_gpu, wy)

    edgetaper_kernel(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                     block=block_size, grid=grid_size)
开发者ID:matthiaslee,项目名称:VMBD,代码行数:31,代码来源:gputools.py
示例4: prepare_device_arrays
def prepare_device_arrays(self):
    """Allocate device buffers for the layer tables and bin centres and fill them.

    Sets ``self.maxLayers`` and the device handles ``d_numLayers``,
    ``d_densityInLayer``, ``d_distanceInLayer``, ``d_ecen_fine`` and
    ``d_czcen_fine``.
    """
    self.maxLayers = self.grid_prop.GetMaxLayers()
    n_cz = len(self.czcen_fine)
    flat_len = n_cz * self.maxLayers

    layer_counts = np.zeros(n_cz, dtype=np.int32)
    layer_density = np.zeros(flat_len, dtype=self.FTYPE)
    layer_distance = np.zeros(flat_len, dtype=self.FTYPE)

    # The host arrays are handed to grid_prop before being uploaded;
    # presumably it fills them in place -- confirm against grid_prop.
    self.grid_prop.GetNumberOfLayers(layer_counts)
    self.grid_prop.GetDensityInLayer(layer_density)
    self.grid_prop.GetDistanceInLayer(layer_distance)

    # Copy all these earth info arrays to device: allocate first, then copy.
    earth_tables = (
        ('d_numLayers', layer_counts),
        ('d_densityInLayer', layer_density),
        ('d_distanceInLayer', layer_distance),
    )
    for attr, host in earth_tables:
        setattr(self, attr, cuda.mem_alloc(host.nbytes))
    for attr, host in earth_tables:
        cuda.memcpy_htod(getattr(self, attr), host)

    # Same pattern for the fine-binned centre arrays held on the instance.
    bin_centres = (
        ('d_ecen_fine', self.ecen_fine),
        ('d_czcen_fine', self.czcen_fine),
    )
    for attr, host in bin_centres:
        setattr(self, attr, cuda.mem_alloc(host.nbytes))
    for attr, host in bin_centres:
        cuda.memcpy_htod(getattr(self, attr), host)
开发者ID:gkrueckl,项目名称:pisa,代码行数:26,代码来源:Prob3GPUOscillationService.py
示例5: _set
def _set(self, ary):
    """Pack ``ary`` into a zero-padded host buffer and upload it to ``self.data``.

    The staging buffer has shape ``(nrow, leaddim)``; only the first ``ncol``
    columns receive packed data, the remaining columns stay zero.
    """
    padded = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
    padded[:, 0:self.ncol] = self._pack(ary)

    # Host -> device copy of the whole padded buffer.
    cuda.memcpy_htod(self.data, padded)
开发者ID:pv101,项目名称:PyFR,代码行数:7,代码来源:types.py
示例6: from_np
def from_np(np_data):
    """Upload ``np_data`` to a new device buffer and wrap it in a ``MyTensor``.

    The returned tensor keeps the source host array as ``cpudata``.
    """
    device_buf = cuda.mem_alloc(np_data.nbytes)
    cuda.memcpy_htod(device_buf, np_data)

    result = MyTensor(device_buf, shape=np_data.shape, size=np_data.size)
    result.cpudata = np_data
    return result
开发者ID:hughperkins,项目名称:neon,代码行数:7,代码来源:test_correctness.py
示例7: cuda_crossOver
def cuda_crossOver(sola, solb):
    """Run the ``crossOver`` kernel on two solutions and return the results.

    Args:
        sola: first host array (must expose ``nbytes``).
        solb: second host array (must expose ``nbytes``).

    Returns:
        Tuple ``(a_new, b_new)`` of host arrays read back from the device
        after the kernel has run.

    NOTE(review): the launch uses a single block of ``len(sola)`` threads,
    so the solution length must fit the device's per-block thread limit.
    """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)
    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)
    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    # Comparison kept as-is: only an exact True enables the debug output.
    if debug == True:
        # The inputs are sola/solb; the former ``a``/``b`` were undefined
        # names and raised NameError. Single-argument print() behaves the
        # same on Python 2 and 3.
        print("a: %s" % (sola,))
        print("b: %s" % (solb,))
        print("new a: %s" % (a_new,))
        print("new b: %s" % (b_new,))

    return a_new, b_new
开发者ID:adamuas,项目名称:coevondm,代码行数:27,代码来源:cudaInterface.py
示例8: _to_device
def _to_device(self, module):
    """Copy this constant into the module global of the same name.

    Raises:
        RuntimeError: if the module global's size differs from the number
            of bytes held by ``self.data``.
    """
    ptr, size = module.get_global(self.name)
    if size != self.data.nbytes:
        raise RuntimeError(
            "Const %s needs %d bytes, but only space for %d"
            % (self, self.data.nbytes, size))

    # Nothing to upload unless the data is currently marked host-only.
    if self.state is not DeviceDataMixin.HOST:
        return
    driver.memcpy_htod(ptr, self._data)
    self.state = DeviceDataMixin.BOTH
开发者ID:RomainBrault,项目名称:PyOP2,代码行数:7,代码来源:cuda.py
示例9: __init__
def __init__(self, n_dict, V, dt, debug=False):
    """Upload the per-neuron parameters and build the update kernel.

    Args:
        n_dict: mapping with the per-neuron parameter sequences read below
            (``'id'``, ``'initn'``, ``'V1'`` ... ``'offset'``, ``'initV'``).
        V: device address (anything ``int()`` accepts) that receives the
            initial values from ``n_dict['initV']``.
        dt: time step; split into ``steps`` sub-steps of length ``ddt``.
        debug: stored unchanged on the instance.
    """
    self.num_neurons = len(n_dict['id'])
    self.dt = np.double(dt)
    # At least one sub-step, otherwise roughly one per 1e-5 of dt.
    self.steps = max(int(round(dt / 1e-5)), 1)
    self.debug = debug
    self.ddt = dt / self.steps
    self.V = V

    # (instance attribute, n_dict key) pairs, uploaded as float64 in order.
    gpu_params = (
        ('n', 'initn'),
        ('V_1', 'V1'),
        ('V_2', 'V2'),
        ('V_3', 'V3'),
        ('V_4', 'V4'),
        ('V_l', 'V_l'),
        ('V_ca', 'V_ca'),
        ('V_k', 'V_k'),
        ('G_l', 'G_l'),
        ('G_ca', 'G_ca'),
        ('G_k', 'G_k'),
        ('Tphi', 'phi'),
        ('offset', 'offset'),
    )
    for attr, key in gpu_params:
        host_values = np.asarray(n_dict[key], dtype=np.float64)
        setattr(self, attr, garray.to_gpu(host_values))

    initial_v = np.asarray(n_dict['initV'], dtype=np.double)
    cuda.memcpy_htod(int(self.V), initial_v)
    self.update = self.get_euler_kernel()
开发者ID:yiyin,项目名称:neurokernel,代码行数:30,代码来源:MorrisLecar_a.py
示例10: evaluate
def evaluate(self, params, returnOutputs=False):
    """Run the evaluation kernel for a whole population on the training set.

    @param params: one Parameters structure per network; the count must
        equal self.popSize
    @type params: list of Parameters
    @param returnOutputs: hand back the network output values (debug)
    @type returnOutputs: bool, default False
    @return the (popSize, trainSet.size) float32 output matrix if
        returnOutputs=True, else None
    @raise ValueError: when len(params) differs from self.popSize
    """
    n_given = len(params)
    if n_given != self.popSize:
        raise ValueError("Need %d Parameter structures (provided %d)" % (
            self.popSize, n_given))

    # Pack the structures into one contiguous array and upload it.
    host_params = (Parameters * n_given)(*params)
    driver.memcpy_htod(self.params, host_params)

    # TODO: remove
    # Zeroes the output buffer; 4 is the byte size of one float32 output.
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)

    self.evaluateKernel.prepared_call(
        self.evaluateGridDim,
        self.trainSetDev,
        self.trainSet.size,
        self.params,
        self.popSize,
        self.outputs)
    driver.Context.synchronize()

    self.outputsMat = driver.from_device(
        self.outputs,
        shape=(self.popSize, self.trainSet.size),
        dtype=np.float32)

    if not returnOutputs:
        return None
    return self.outputsMat
开发者ID:cpatulea,项目名称:evolution,代码行数:35,代码来源:ann.py
示例11: __init__
def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False):
    """Upload the per-neuron parameters and build the update kernel.

    Args:
        n_dict: mapping with the per-neuron parameter sequences read below.
        V: device address (anything ``int()`` accepts) that receives the
            initial values from ``n_dict["initV"]``.
        dt: time step; split into ``steps`` sub-steps of length ``ddt``.
        debug: stored unchanged on the instance.
        cuda_verbose: when true, ``--ptxas-options=-v`` is stored in
            ``self.compile_options``.
    """
    self.compile_options = ["--ptxas-options=-v"] if cuda_verbose else []

    self.num_neurons = len(n_dict["id"])
    self.dt = np.double(dt)
    # At least one sub-step, otherwise roughly one per 1e-5 of dt.
    self.steps = max(int(round(dt / 1e-5)), 1)
    self.debug = debug
    self.ddt = dt / self.steps
    self.V = V

    def upload(key):
        # Convert one n_dict entry to float64 and copy it to the GPU.
        return garray.to_gpu(np.asarray(n_dict[key], dtype=np.float64))

    self.n = upload("initn")
    self.V_1 = upload("V1")
    self.V_2 = upload("V2")
    self.V_3 = upload("V3")
    self.V_4 = upload("V4")
    self.V_l = upload("V_l")
    self.V_ca = upload("V_ca")
    self.V_k = upload("V_k")
    self.G_l = upload("G_l")
    self.G_ca = upload("G_ca")
    self.G_k = upload("G_k")
    self.Tphi = upload("phi")
    self.offset = upload("offset")

    initial_v = np.asarray(n_dict["initV"], dtype=np.double)
    cuda.memcpy_htod(int(self.V), initial_v)
    self.update = self.get_euler_kernel()
开发者ID:neurokernel,项目名称:neurodriver,代码行数:32,代码来源:MorrisLecar_a.py
示例12: __compile_kernels
def __compile_kernels(self):
    """Bind the forest's kernels to this object and publish index pointers.

    Every kernel handle is taken from ``self.forest`` as-is. Afterwards the
    device addresses of the two sorted-index arrays are written into the
    BFS module globals ``sorted_indices_1`` and ``sorted_indices_2``.
    """
    forest = self.forest

    kernel_names = (
        # DFS module
        'find_min_kernel',
        'fill_kernel',
        'scan_reshuffle_tex',
        'comput_total_2d',
        'reduce_2d',
        'scan_total_2d',
        'scan_reduce',
        # BFS module
        'scan_total_bfs',
        'comput_bfs_2d',
        'fill_bfs',
        'reshuffle_bfs',
        'reduce_bfs_2d',
        'get_thresholds',
        # Other
        'predict_kernel',
        'mark_table',
    )
    for name in kernel_names:
        setattr(self, name, getattr(forest, name))

    # get_global(...)[0] is the device pointer of the module-level symbol.
    slot_1 = forest.bfs_module.get_global("sorted_indices_1")[0]
    slot_2 = forest.bfs_module.get_global("sorted_indices_2")[0]
    cuda.memcpy_htod(slot_1, np.uint64(self.sorted_indices_gpu.ptr))
    cuda.memcpy_htod(slot_2, np.uint64(self.sorted_indices_gpu_.ptr))
开发者ID:phecy,项目名称:CudaTree,代码行数:26,代码来源:random_tree.py
示例13: calc_bandwidth_h2d
def calc_bandwidth_h2d( s ):
    """Time one host-to-device copy of ``s.a`` and return the bandwidth.

    The result is ``s.nbytes`` divided by the elapsed wall-clock seconds and
    by the module-level ``gbytes`` scale factor.
    """
    started = datetime.now()
    cuda.memcpy_htod(s.dev_a, s.a)
    elapsed = datetime.now() - started

    # Only the seconds and microseconds fields are used (the days field of
    # the timedelta is ignored).
    elapsed_s = elapsed.seconds + elapsed.microseconds * 1e-6
    return s.nbytes / elapsed_s / gbytes
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:7,代码来源:150-gpus-mpi-range-h5-seperate.py
示例14: __compute_guassian_on_pts
def __compute_guassian_on_pts(self):
    """Run the Gaussian kernel cell by cell over every data set.

    Each data set is copied, its first two columns are rescaled by the
    view's left/bottom origin and width/height, and for every grid cell the
    points inside the cell (grown by 3 * sigma on each side) are uploaded
    and passed to ``self.gpu_gaussian``.

    NOTE(review): loop nesting was reconstructed from a flattened listing;
    confirm against the project source.
    """
    view = self.view_tile.get_View()

    for dset in self.data_sets:
        # Work on a copy so the stored data set is left untouched.
        scaled = np.array(dset.getDataSet(), copy=True)
        scaled[:, 0] = (scaled[:, 0] - view.left) / view.width()
        scaled[:, 1] = (scaled[:, 1] - view.bottom) / view.height()

        for row in range(self.grid_size[0]):
            for col in range(self.grid_size[1]):
                # 3 * SIGMA give the 95%
                margin = 3 * self.sigma
                left = 1 / float(self.grid_size[1]) * col - margin
                right = 1 / float(self.grid_size[1]) * (col + 1) + margin
                bottom = 1 / float(self.grid_size[0]) * row - margin
                top = 1 / float(self.grid_size[0]) * (row + 1) + margin

                pts = getFilteredDataSet(scaled, (left, right, bottom, top))
                if len(pts) == 0:
                    continue

                # Per-cell upload; the buffer is released after the launch.
                self.pts_gpu = cuda.mem_alloc_like(pts)
                cuda.memcpy_htod(self.pts_gpu, pts)
                self.gpu_gaussian(
                    self.grid_gpu,                # Grid
                    self.pts_gpu,                 # Points
                    np.int32(col),                # Block Index x
                    np.int32(row),                # Block Index y
                    np.int32(self.grid_size[1]),  # Grid Dimensions x
                    np.int32(self.grid_size[0]),  # Grid Dimensions y
                    np.int32(pts.shape[0]),       # Point Length
                    np.float32(self.dx),          # dx
                    np.float32(self.dy),          # dy
                    np.float32(self.sigma),       # Sigma
                    block=self.block_size)
                self.pts_gpu.free()
开发者ID:SCIInstitute,项目名称:MLM,代码行数:34,代码来源:gaussian_gpu_grid.py
示例15: interior_buffer
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    """Run the mask kernel on ``source_im`` and return the resulting buffer.

    Args:
        source_im: host array uploaded to the device and processed in place.
        dest_im: array whose shape fills the template's WIDTH/HEIGHT and
            whose contents seed the uint8 result buffer.
        b_size: CUDA block size; its first two entries fill BLOCK_DIM_X/Y.
        g_size: CUDA grid size.
        RGB: value substituted for the template's ``RGB`` variable.
        neighbors: value substituted for the template's ``NEIGHBORS`` variable.

    Returns:
        uint8 array overwritten with the device buffer after the kernel ran.
    """
    # Fill in the Cheetah template variables for the mask kernel.
    tmpl = Template(mask_source)
    substitutions = (
        ('BLOCK_DIM_X', b_size[0]),
        ('BLOCK_DIM_Y', b_size[1]),
        ('WIDTH', dest_im.shape[1]),
        ('HEIGHT', dest_im.shape[0]),
        ('RGB', RGB),
        ('NEIGHBORS', neighbors),
    )
    for var_name, value in substitutions:
        setattr(tmpl, var_name, value)

    kernel = cuda_compile(tmpl, "mask_kernel")

    # Upload the source image and let the kernel work on the device copy.
    device_img = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(device_img, source_im)
    kernel(device_img, block=b_size, grid=g_size)

    # Read the processed device buffer back into a uint8 host array.
    result = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(result, device_img)
    return result
开发者ID:JMTing,项目名称:cs205,代码行数:26,代码来源:parallel_poisson.py
示例16: __init__
def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False):
    """Upload the per-neuron parameters and build the update kernel.

    Args:
        n_dict: mapping with the per-neuron parameter sequences read below.
        V: device address (anything ``int()`` accepts) that receives the
            initial values from ``n_dict['initV']``.
        dt: time step; split into ``steps`` sub-steps of length ``ddt``.
        debug: stored unchanged on the instance.
        cuda_verbose: when true, ``--ptxas-options=-v`` is stored in
            ``self.compile_options``.
    """
    self.compile_options = ['--ptxas-options=-v'] if cuda_verbose else []

    self.num_neurons = len(n_dict['id'])
    self.dt = np.double(dt)
    # At least one sub-step, otherwise roughly one per 1e-5 of dt.
    self.steps = max(int(round(dt / 1e-5)), 1)
    self.debug = debug
    self.ddt = dt / self.steps
    self.V = V

    # (instance attribute, n_dict key) pairs, uploaded as float64 in order.
    for attr, key in (('n', 'initn'),
                      ('V_1', 'V1'),
                      ('V_2', 'V2'),
                      ('V_3', 'V3'),
                      ('V_4', 'V4'),
                      ('Tphi', 'phi'),
                      ('offset', 'offset')):
        host_values = np.asarray(n_dict[key], dtype=np.float64)
        setattr(self, attr, garray.to_gpu(host_values))

    initial_v = np.asarray(n_dict['initV'], dtype=np.double)
    cuda.memcpy_htod(int(self.V), initial_v)
    self.update = self.get_euler_kernel()
开发者ID:neurokernel,项目名称:neurodriver,代码行数:27,代码来源:MorrisLecarCopy.py
示例17: compile_for_GPU
def compile_for_GPU(function_package, kernel_function_name='default'):
"""Build kernel source, compile it with CustomSourceModule and register it.

For the name 'default' the module is built from `attachment` alone.
Otherwise the code stored in `function_code_dict` for
`function_package.function_name` is translated with `translate_to_CUDA`,
wrapped in an `extern "C"` block appended to `attachment`, and compiled.

The compiled module is stored in `source_module_dict` under
`kernel_function_name`.

NOTE(review): indentation was lost in this listing, so it cannot be
determined from here which of the trailing statements (DEVICE_NUMBER
upload, func_dict registration, helper textures) belong to the `else`
branch -- confirm against the project source.
"""
kernel_code = ''
if kernel_function_name == 'default':
kernel_code = attachment
source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code)
else:
fp = function_package
from vivaldi_translator import translate_to_CUDA
function_name = fp.function_name
Vivaldi_code = function_code_dict[function_name]
function_code = translate_to_CUDA(Vivaldi_code=Vivaldi_code, function_name=function_name, function_arguments=fp.function_args)
# Translated code is appended to the shared attachment inside extern "C".
kernel_code = attachment + 'extern "C"{\n'
kernel_code += function_code
kernel_code += '\n}'
# Always-on debug dump of the generated source to asdf.cu in the current
# working directory.
# NOTE(review): the file is not closed if write() raises; a `with` block
# would guarantee it.
if True: # print for debugging
f = open('asdf.cu','w')
f.write(kernel_code)
f.close()
#print function_code
# NOTE(review): `args` is not used anywhere in the visible code.
args = [kernel_code]
source_module_dict[kernel_function_name] = CustomSourceModule(kernel_code)
# Write the module-level `device_number` into the DEVICE_NUMBER global of the
# compiled module.
temp,_ = source_module_dict[kernel_function_name].get_global('DEVICE_NUMBER')
cuda.memcpy_htod(temp, numpy.int32(device_number))
# Register the kernel entry point under the same name as the module.
func_dict[kernel_function_name] = source_module_dict[kernel_function_name].get_function(kernel_function_name)
create_helper_textures(source_module_dict[kernel_function_name])
开发者ID:Anukura,项目名称:Vivaldi,代码行数:34,代码来源:GPU_unit.py
示例18: set
def set(self, ary, device=None):
    """
    Copy a host array to the device buffer of this tensor.

    Arguments:
        ary: host array, needs to be contiguous and have the same number
             of elements as this tensor; it is converted to this tensor's
             dtype when the dtypes differ
        device: device id, if not the one attached to current context

    Returns:
        self

    Raises:
        AssertionError: if the sizes differ, this tensor is not contiguous,
            or the strides of the (converted) host array do not match.
    """
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    # Compare dtypes by value: identity ("is not") reports equal dtypes held
    # in different objects as different and forces a needless conversion.
    if ary.dtype != self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == self.strides

    if device is None:
        drv.memcpy_htod(self.gpudata, ary)
    else:
        # with multithreaded datasets, make a context before copying
        # and destroy it again once done.
        ctx = drv.Device(device).make_context()
        try:
            drv.memcpy_htod(self.gpudata, ary)
        finally:
            # Pop even when the copy fails so the temporary context does
            # not stay on this thread's context stack.
            ctx.pop()
            del ctx
    return self
开发者ID:KayneWest,项目名称:nervanagpu,代码行数:26,代码来源:nervanagpu.py
示例19: _read_LPU_input
def _read_LPU_input(self, in_gpot_dict, in_spike_dict):
"""
Put inputs from other LPUs to buffer.

in_gpot_dict maps a sending LPU to the gpot data it delivered;
in_spike_dict maps a sending LPU to the spike indices it delivered.
Each entry is copied host-to-device into the matching region of
self.buffer.gpot_buffer / self.buffer.spike_buffer.

NOTE(review): indentation was lost in this listing; whether the debug
append sits inside the `num_input_gpot_neurons[i] > 0` check cannot be
determined from here -- confirm against the project source.
NOTE(review): dict.iteritems() exists only on Python 2.
"""
for other_lpu, gpot_data in in_gpot_dict.iteritems():
i = self.other_lpu_map[other_lpu]
if self.num_input_gpot_neurons[i] > 0:
# Destination address = buffer base address +
# (gpot_current * ld + my_num_gpot_neurons + cum_virtual_gpot_neurons[i])
# * itemsize.
cuda.memcpy_htod(int(int(self.buffer.gpot_buffer.gpudata) \
+(self.buffer.gpot_current * self.buffer.gpot_buffer.ld \
+ self.my_num_gpot_neurons + self.cum_virtual_gpot_neurons[i]) \
* self.buffer.gpot_buffer.dtype.itemsize), gpot_data)
# In debug mode the received data is also appended, as a single row, to the
# per-LPU file array.
if self.debug:
self.in_gpot_files[other_lpu].root.array.append(gpot_data.reshape(1,-1))
#Will need to change this if only spike indexes are transmitted
for other_lpu, sparse_spike in in_spike_dict.iteritems():
i = self.other_lpu_map[other_lpu]
if self.num_input_spike_neurons[i] > 0:
# Expand the received indices into a dense int32 0/1 vector.
full_spike = np.zeros(self.num_input_spike_neurons[i],dtype=np.int32)
if len(sparse_spike)>0:
# Translate each received index through input_spike_idx_map[i] before
# marking it.
idx = np.asarray([self.input_spike_idx_map[i][k] \
for k in sparse_spike], dtype=np.int32)
full_spike[idx] = 1
# Destination address is computed the same way as for the gpot buffer above,
# using the spike buffer's fields.
cuda.memcpy_htod(int(int(self.buffer.spike_buffer.gpudata) \
+(self.buffer.spike_current * self.buffer.spike_buffer.ld \
+ self.my_num_spike_neurons + self.cum_virtual_spike_neurons[i]) \
* self.buffer.spike_buffer.dtype.itemsize), full_spike)
开发者ID:LuisMoralesAlonso,项目名称:neurokernel,代码行数:31,代码来源:LPU.py
示例20: __init__
def __init__(self, pts, axis, split, sigma):
    """Store the points and prepare the grid and Gaussian kernel on the GPU.

    Args:
        pts: point array; converted to C-contiguous layout when needed.
        axis: stored unchanged on the instance.
        split: two-element grid shape; both entries must be at least 2.
        sigma: stored unchanged on the instance.

    Raises:
        ValueError: if either entry of ``split`` is smaller than 2.
        Exception: if ``pts`` is still not C-contiguous after conversion.
    """
    if any(extent < 2 for extent in (split[0], split[1])):
        raise ValueError("Split needs to be at least 2x2")

    # Try one conversion, then insist on the layout.
    if not pts.flags['C_CONTIGUOUS']:
        pts = np.require(pts, dtype=pts.dtype, requirements=['C'])
    if not pts.flags['C_CONTIGUOUS']:
        raise Exception("Points are not contiguous")

    self.axis = axis
    self.sigma = sigma
    self.pts = pts
    self.pts_gpu = None

    # Zero-filled grid in the points' dtype, mirrored into device memory.
    self.grid = np.zeros(split).astype(pts.dtype)
    self.grid_gpu = cuda.mem_alloc_like(self.grid)
    cuda.memcpy_htod(self.grid_gpu, self.grid)

    self.gpu_gaussian = SourceModule(self.__cuda_code).get_function("gpu_gaussian")

    # Step between neighbouring samples when the unit interval is covered by
    # split[0] (dx) and split[1] (dy) samples.
    first_extent, second_extent = split[0], split[1]
    self.dx = 1 / float(first_extent - 1)
    self.dy = 1 / float(second_extent - 1)
    self.grid_size, self.block_size = self.__setup_cuda_sizes(split)
开发者ID:SCIInstitute,项目名称:MLM,代码行数:26,代码来源:gaussian_gpu.py
注:本文中的 pycuda.driver.memcpy_htod 函数示例由纯净天空整理自 Github/MSDocs 等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。
请发表评论