本文整理汇总了 Python 中 pycuda.driver.pagelocked_zeros 函数的典型用法代码示例。如果您正苦于以下问题:Python pagelocked_zeros 函数的具体用法?Python pagelocked_zeros 怎么用?Python pagelocked_zeros 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 pagelocked_zeros 函数的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。
示例1: test_streamed_kernel
def test_streamed_kernel(self):
    """Run an element-wise multiply kernel on a CUDA stream and check the result.

    Two page-locked ``float32`` arrays of shape ``(32, 8)`` are filled with
    random values, multiplied on the device by ``multiply_them`` and the
    output is compared against ``a * b`` computed on the host.  The test
    passes only when the two agree exactly (norm of the difference is 0).

    NOTE(review): ``self`` is presumably a ``unittest.TestCase`` (the
    original called ``self.assert_``) -- confirm against the enclosing class.
    """
    # this differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    mod = drv.SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x*blockDim.y + threadIdx.y;
dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32,8)
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    a[:] = numpy.random.randn(*shape)
    b[:] = numpy.random.randn(*shape)

    strm = drv.Stream()
    dest = drv.pagelocked_empty_like(a)
    # One thread per element: the block shape is the array shape plus a
    # trailing 1, and the kernel flattens (x, y) into a linear index.
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=shape+(1,), stream=strm)
    # Wait for everything queued on the stream before reading ``dest``.
    strm.synchronize()

    # ``assert_`` is a deprecated alias that no longer exists in
    # Python 3.12+; ``assertTrue`` is the supported spelling.
    self.assertTrue(la.norm(dest-a*b) == 0)
开发者ID:berlinguyinca,项目名称:pycuda,代码行数:31,代码来源:test_driver.py
示例2: __call__
def __call__(self):
    """Propagate the collected spikes through NeMo and add the result to the targets.

    The first ``self.nspikes`` entries of ``self.collected_spikes`` are
    passed to ``self.net.nemo_sim.propagate``.  On the GPU path the spikes
    are first expanded into a per-neuron flag array held in page-locked
    memory and mirrored on the device; the flag array, its device copy and
    the accumulator buffer are created on the first call and cached on
    ``self`` for later calls.  Finally the accumulated values are added to
    every target in ``self.net.nemo_propagate_targets`` and the spike
    counter is reset to zero.
    """
    spikes = self.collected_spikes[:self.nspikes]
    total_neurons = self.net.total_neurons

    if not self.use_gpu:
        # CPU path: NeMo receives the address and length of the spike index array.
        acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                              spikes.ctypes.data, len(spikes))
        acc = numpy_array_from_memory(acc_ptr, total_neurons, float32)
    else:
        if hasattr(self, 'spikes_gpu'):
            # Later calls: reset the cached flags and re-upload them.
            fired = self.spikes_bool
            fired[:] = False
            fired[spikes] = True
            pycuda.driver.memcpy_htod(self.spikes_gpu.gpudata, fired)
            device_ptr = self.spikes_gpu_ptr
        else:
            # First call: build the flag array and its device mirror, then cache both.
            fired = drv.pagelocked_zeros(total_neurons, dtype=uint32)
            fired[spikes] = True
            on_device = pycuda.gpuarray.to_gpu(fired)
            device_ptr = int(on_device.gpudata)
            self.spikes_bool = fired
            self.spikes_gpu = on_device
            self.spikes_gpu_ptr = device_ptr

        acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                              device_ptr, total_neurons)
        # The host-side accumulator buffer is allocated once and reused.
        if not hasattr(self, 'acc'):
            self.acc = drv.pagelocked_zeros(total_neurons, dtype=float32)
        acc = self.acc
        pycuda.driver.memcpy_dtoh(acc, acc_ptr)

    for _, targetvar, targetslice in self.net.nemo_propagate_targets:
        targetvar += acc[targetslice]
    self.nspikes = 0
开发者ID:JoErNanO,项目名称:brian,代码行数:35,代码来源:briantonemo.py
示例3: _allocate_arrays
def _allocate_arrays(self):
    """Allocate the device work arrays and the host-side index/value arrays.

    Sizes are derived from ``max_features``, ``n_labels``, ``n_features``,
    ``n_samples``, ``stride`` and ``MAX_BLOCK_PER_FEATURE``; element types
    come from ``dtype_counts`` / ``dtype_indices`` or are fixed numpy types.
    """
    n_feat = self.max_features
    per_feature_blocks = self.MAX_BLOCK_PER_FEATURE
    # impurity_left gets at least four slots, even with fewer features.
    imp_size = max(n_feat, 4)

    # --- device (GPU) arrays ---
    self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32)
    self.impurity_right = gpuarray.empty(n_feat, dtype = np.float32)
    self.min_split = gpuarray.empty(n_feat, dtype = self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)
    self.label_total_2d = gpuarray.zeros(
        n_feat * (per_feature_blocks + 1) * self.n_labels,
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(n_feat * per_feature_blocks * 2, np.float32)
    self.min_split_2d = gpuarray.empty(n_feat * per_feature_blocks, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8)

    # --- host (numpy) arrays ---
    n_samples = self.n_samples
    self.idx_array = np.zeros(2 * n_samples, dtype = np.uint32)
    self.si_idx_array = np.zeros(n_samples, dtype = np.uint8)
    self.nid_array = np.zeros(n_samples, dtype = np.uint32)
    self.values_idx_array = np.zeros(2 * n_samples, dtype = self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * n_samples, dtype = np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)

    # --- page-locked host arrays ---
    self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32)
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16)
    # Start with the identity ordering 0 .. n_features-1.
    self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
开发者ID:pjankiewicz,项目名称:CudaTree,代码行数:29,代码来源:random_forest.py
示例4: getRT
def getRT(self, s_map, srt_gpu, srt_nsamp, srt_npairs, npairs, store_rt=False):
    """
    Computes the rank template

    s_map(Sample Map) - a list of 1s and 0s of length nsamples where 1 means use this sample
        to compute rank template
    srt_gpu - cuda memory object containing srt(sample rank template) array on gpu
    srt_nsamp, srt_npairs - shape(buffered) of srt_gpu object
    npairs - true number of gene pairs being compared
    store_rt - determines the RETURN value
        False(default) = returns a numpy array shape(npairs) of the rank template
        True = returns the device pointer of the rank template and its padded
               size (rt_gpu, srt_npairs)

    The block size used for the computation is taken from self.b_size.
    The sample-map buffer and the rank-template buffer are kept alive on
    self (self.s_map_buff, self.rt_buffer) because the kernels access them
    through mapped device pointers.
    """
    b_size = self.b_size
    mapped = cuda.host_alloc_flags.DEVICEMAP

    # Zero-padded, device-mapped copy of the sample map.
    s_map_buff = self.s_map_buff = cuda.pagelocked_zeros((int(srt_nsamp),), np.int32, mem_flags=mapped)
    s_map_buff[:len(s_map)] = np.array(s_map, dtype=np.int32)
    s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())

    # sample blocks
    g_y_sz = self.getGrid(srt_nsamp)
    # pair blocks
    g_x_sz = self.getGrid(srt_npairs)

    # Scratch buffer with one uint32 per (sample block, pair).
    block_rt_gpu = cuda.mem_alloc(int(g_y_sz*srt_npairs*(np.uint32(1).nbytes)))

    grid = (g_x_sz, g_y_sz)
    func1, func2 = self.getrtKern(g_y_sz)

    shared_size = b_size*b_size*np.uint32(1).nbytes

    func1(srt_gpu, np.uint32(srt_nsamp), np.uint32(srt_npairs), s_map_gpu, block_rt_gpu, np.uint32(g_y_sz),
          block=(b_size, b_size, 1), grid=grid, shared=shared_size)

    rt_buffer = self.rt_buffer = cuda.pagelocked_zeros((int(srt_npairs),), np.int32, mem_flags=mapped)
    rt_gpu = np.intp(rt_buffer.base.get_device_pointer())

    func2(block_rt_gpu, rt_gpu, np.int32(s_map_buff.sum()), block=(b_size, 1, 1), grid=(g_x_sz,))

    if store_rt:
        # this is in case we want to run further stuff without
        # transferring back and forth
        return (rt_gpu, srt_npairs)

    # Kernel launches return before the device has finished; wait here so
    # the caller never reads the mapped host buffer while it is still
    # being written.
    cuda.Context.synchronize()
    return rt_buffer[:npairs]
开发者ID:JohnCEarls,项目名称:tcDirac,代码行数:54,代码来源:gpu.py
示例5: prepare
def prepare(self, P):
    """Allocate state buffers and compile one ``applyFun`` kernel per non-zero variable.

    ``P`` is only used to obtain the number of elements ``n`` (the length
    of the state of the first non-zero differential equation).  The method
    fills the index/name lookup tables, allocates page-locked host and
    device buffers of shape ``(n, var_len)`` in float64, and for every
    index in ``self.index_nonzero`` builds and prepares a CUDA function
    stored in ``self.applyFun``.
    """
    eqs = self.eqs
    n = len(P.state_(eqs._diffeq_names_nonzero[0]))
    var_len = len(dict.fromkeys(eqs._diffeq_names))+1 # +1 needed to store t

    for index, varname in enumerate(eqs._diffeq_names):
        self.index_to_varname.append(varname)
        self.varname_to_index[varname] = index
        if varname in eqs._diffeq_names_nonzero:
            self.index_nonzero.append(index)

    state_shape = (n, var_len)
    self.S_in = cuda.pagelocked_zeros(state_shape, numpy.float64)
    self.S_out = cuda.pagelocked_zeros(state_shape, numpy.float64)

    nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize
    self.S_in_gpu = cuda.mem_alloc(nbytes)
    self.S_out_gpu = cuda.mem_alloc(nbytes)

    # A and B start out as all-zero device arrays.
    Z = zeros(state_shape)
    self.A_gpu = cuda.mem_alloc(nbytes)
    cuda.memcpy_htod(self.A_gpu, Z)
    self.B_gpu = cuda.mem_alloc(nbytes)
    cuda.memcpy_htod(self.B_gpu, Z)
    self.S_temp_gpu = cuda.mem_alloc(nbytes)

    # The argument list handed to f(...) is the same for every variable:
    # one S_temp entry per column of the current row.
    call_args = ",".join(
        ["S_temp["+str(i)+" + blockIdx.x * var_len]" for i in xrange(var_len)])

    self.applyFun = {}
    for x in self.index_nonzero:
        s = eqs._function_C_String[self.index_to_varname[x]]
        kernel_src = ("""
__device__ double f""" + s + """
__global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len)
{
int idx = x + blockIdx.x * var_len;
S_temp[idx] = 0;
B[idx] = f(""" + call_args + """);
S_temp[idx] = 1;
A[idx] = f(""" + call_args + """) - B[idx];
B[idx] /= A[idx];
S_temp[idx] = S_in[idx];
}
""")
        compiled = SourceModule(kernel_src)
        self.applyFun[x] = compiled.get_function("applyFun")
        self.applyFun[x].prepare(['P','P','P','P','i','i'],block=(1,1,1))

    self.calc_dict = {}
    self.already_calc = {}
开发者ID:JoErNanO,项目名称:brian,代码行数:52,代码来源:gpustateupdater.py
示例6: _initialize_gpu_ds
def _initialize_gpu_ds(self):
    """
    Setup GPU arrays.

    Allocates the synapse state, wraps device-mapped page-locked host
    buffers as GPUArrays for the graded-potential voltages (``V``) and the
    spike states, and uploads the output/input port index arrays.
    """
    n_synapse_slots = max(int(self.total_synapses) + len(self.input_neuron_list), 1)
    self.synapse_state = garray.zeros(n_synapse_slots, np.float64)

    if self.total_num_gpot_neurons > 0:
        # V lives in mapped host memory; the GPUArray is a view on its
        # device pointer rather than a separate device allocation.
        v_host = drv.pagelocked_zeros(
            int(self.total_num_gpot_neurons),
            np.float64, mem_flags=drv.host_alloc_flags.DEVICEMAP)
        self.V_host = v_host
        self.V = garray.GPUArray(v_host.shape,
                                 v_host.dtype,
                                 gpudata=v_host.base.get_device_pointer())
    else:
        self.V = None

    # NOTE(review): unlike V, spike_state is left unset (not None) when
    # there are no spiking neurons -- confirm callers never read it then.
    if self.total_num_spike_neurons > 0:
        spk_host = drv.pagelocked_zeros(
            int(self.total_num_spike_neurons),
            np.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
        self.spike_state_host = spk_host
        self.spike_state = garray.GPUArray(spk_host.shape,
                                           spk_host.dtype,
                                           gpudata=spk_host.base.get_device_pointer())

    self.block_extract = (256, 1, 1)

    if len(self.out_ports_ids_gpot) > 0:
        self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot)
        self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids)
        self._extract_gpot = self._extract_projection_gpot_func()

    if len(self.out_ports_ids_spk) > 0:
        spk_port_ids = (self.out_ports_ids_spk).astype(np.int32)
        self.out_ports_ids_spk_g = garray.to_gpu(spk_port_ids)
        self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids)
        self._extract_spike = self._extract_projection_spike_func()

    if self.ports_in_gpot_mem_ind is not None:
        self.inds_gpot = garray.to_gpu(self.sel_in_gpot_ids)

    if self.ports_in_spk_mem_ind is not None:
        self.inds_spike = garray.to_gpu(self.sel_in_spk_ids)
开发者ID:neurokernel,项目名称:neurodriver-benchmark,代码行数:51,代码来源:neurodriver_demo.py
示例7: getBuff
def getBuff(self, frm, new_r, new_c, b_dtype):
    """
    Generates a device-mapped, page-locked array sized (new_r,new_c) of
    dtype b_dtype that contains the np array frm such that
    frm[i,j] == new[i,j] where new has zeros if
    frm[i,j] is out of bounds.

    If frm.shape does not unpack into exactly two values the input is
    treated as one-dimensional: the result then has shape (new_r,) and
    new_c is ignored.
    """
    mapped = cuda.host_alloc_flags.DEVICEMAP
    # Only the shape unpacking decides between the 1-D and 2-D paths.  The
    # allocation and copy stay outside the try so that a ValueError raised
    # by the copy itself (e.g. frm larger than the buffer) is reported as
    # such instead of silently triggering the 1-D fallback.
    try:
        old_r, old_c = frm.shape
    except ValueError:
        # oned
        old_r = frm.shape[0]
        buff = cuda.pagelocked_zeros((new_r,), b_dtype, mem_flags=mapped)
        buff[:old_r] = frm
    else:
        buff = cuda.pagelocked_zeros((new_r, new_c), b_dtype, mem_flags=mapped)
        buff[:old_r, :old_c] = frm
    return buff
开发者ID:JohnCEarls,项目名称:tcDirac,代码行数:17,代码来源:gpu.py
示例8: GenerateFractal
# Launch the genChunk kernel for an image of size `dimensions`, optionally
# printing a progress bar while it runs, and return the int32 result array
# with the centre element overwritten by iterations+1.
# NOTE(review): this function uses Python 2 print statements and relies on
# `/` producing integers (array index on the last assignment, "#"*pct) --
# confirm the intended interpreter before porting.
def GenerateFractal(dimensions,position,zoom,iterations,block=(20,20,1), report=False, silent=False):
# Per-axis image size divided by the per-axis `block` size, stored as int32.
chunkSize = numpy.array([dimensions[0]/block[0],dimensions[1]/block[1]],dtype=numpy.int32)
zoom = numpy.float32(zoom)
iterations = numpy.int32(iterations)
blockDim = numpy.array([block[0],block[1]],dtype=numpy.int32)
result = numpy.zeros(dimensions,dtype=numpy.int32)
#Center position
# Scale by zoom, shift by half the result shape, truncate to whole numbers
# and store as float32.
position = Vector(position[0]*zoom,position[1]*zoom)
position = position - (Vector(result.shape[0],result.shape[1])/2)
position = numpy.array([int(position.x),int(position.y)]).astype(numpy.float32)
#For progress reporting:
# A single int32 in device-mapped page-locked memory; the host polls it
# below through `ppc` while the kernel receives `ppc_ptr`.
ppc = cuda.pagelocked_zeros((1,1),numpy.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) #pagelocked progress counter
ppc[0,0] = 0
ppc_ptr = numpy.intp(ppc.base.get_device_pointer()) #pagelocked memory counter, device pointer to
#End progress reporting
#Copy parameters over to device
chunkS = In(chunkSize)
posit = In(position)
blockD = In(blockDim)
zoo = In(zoom)
iters = In(iterations)
# NOTE(review): `result` is wrapped with In(...) here, yet `res` is later
# used as the source of memcpy_dtoh -- confirm this really copies the
# kernel output back.
res = In(result)
if not silent:
print("Calling CUDA function. Starting timer. progress starting at: "+str(ppc[0,0]))
start_time = time.time()
# One thread per block; the `block` argument is used as the grid size.
genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr, block=(1,1,1), grid=block)
if report:
total = (dimensions[0]*dimensions[1])
print "Reporting up to "+str(total)+", "+str(ppc[0,0])
# Poll the mapped counter until it reaches the number of pixels, redrawing
# a 100-character progress bar on each pass.
while ppc[0,0] < ((dimensions[0]*dimensions[1])):
pct = (ppc[0,0]*100)/(total)
hashes = "#"*pct
dashes = "-"*(100-pct)
print "\r["+hashes+dashes+"] "+locale.format("%i",ppc[0,0],grouping=True)+"/"+locale.format("%i",total,grouping=True),
time.sleep(0.00001)
# Wait for the kernel to finish before touching the results.
cuda.Context.synchronize()
if not silent:
print "Done. "+str(ppc[0,0])
#Copy result back from device
cuda.memcpy_dtoh(result, res)
if not silent:
end_time = time.time()
elapsed_time = end_time-start_time
print("Done with call. Took "+str(elapsed_time)+" seconds. Here's the repr'd arary:\n")
print(result)
result[result.shape[0]/2,result.shape[1]/2]=iterations+1 #mark center of image
return result
开发者ID:jshearer,项目名称:cudafractal,代码行数:58,代码来源:fractal.py
示例9: find_component_device
def find_component_device(d_v, d_D, length):
    """
    Run the component-labelling steps and return the resulting ``d_D``.

    :param d_v: passed unchanged to every ``component_*`` step.
    :param d_D: initial labelling array; replaced by the step results.
    :param length: size of the work arrays, also passed to every step.
    :return: the final ``d_D``.
    """
    import eulercuda.pyencode as enc
    logger = logging.getLogger('eulercuda.pycomponent.find_component_device')
    logger.info("started.")

    # Scratch arrays, all unsigned-int zeros of the same length.
    d_prevD = np.zeros(length, dtype=np.uintc)
    d_Q, d_t1, d_t2, d_val1, d_val2 = (np.zeros_like(d_prevD) for _ in range(5))

    d_D, d_Q = component_step_init(d_v, d_D, d_Q, length)

    # Device-mapped host counter used to decide whether to iterate again.
    sptemp = drv.pagelocked_zeros(4, dtype=np.intc, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    d_sptemp = np.intp(sptemp.base.get_device_pointer())

    s = sp = 1
    while s == sp:
        # Swap current and previous labelling.
        d_D, d_prevD = d_prevD, d_D

        d_D = component_step1_shortcutting_p1(d_v, d_prevD, d_D, d_Q, length, s)
        d_Q = component_step1_shortcutting_p2(d_v, d_prevD, d_D, d_Q, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step2_P1(
            d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)
        d_D, d_Q = component_Step2_P2(
            d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step3_P1(
            d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)
        d_D = component_Step3_P2(
            d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_val1 = component_step4_P1(d_v, d_D, d_val1, length)
        d_D = component_step4_P2(d_v, d_D, d_val1, length)

        sptemp[0] = 0
        # NOTE(review): this only rebinds d_sptemp to a tuple; no kernel is
        # called, so sptemp[0] stays 0 and the loop ends after one pass.
        # It looks like a function name is missing here -- confirm.
        d_sptemp = (d_Q, length, d_sptemp, s)
        sp += sptemp[0]
        s += 1

    logger.info("Finished. Leaving.")
    return d_D
开发者ID:zenlc2000,项目名称:pycuda-euler,代码行数:57,代码来源:pycomponent.py
示例10: test_streamed_kernel
def test_streamed_kernel(self):
    """Multiply two arrays on the GPU using stream-ordered copies and launch.

    Inputs live in page-locked host memory, are copied to the device
    asynchronously on one stream, multiplied element-wise by the
    ``multiply_them`` kernel, copied back, and the kernel output is
    required to equal ``a * b`` exactly.
    """
    # this differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    kernel_module = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x*blockDim.y + threadIdx.y;
dest[i] = a[i] * b[i];
}
""")
    multiply_them = kernel_module.get_function("multiply_them")

    shape = (32, 8)

    # Page-locked host inputs filled with random values.
    a = drv.pagelocked_zeros(shape, dtype=np.float32)
    b = drv.pagelocked_zeros(shape, dtype=np.float32)
    a[:] = np.random.randn(*shape)
    b[:] = np.random.randn(*shape)

    # Device copies of both inputs.
    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    stream = drv.Stream()

    # Upload, then wait for both copies to finish.
    drv.memcpy_htod_async(a_gpu, a, stream)
    drv.memcpy_htod_async(b_gpu, b, stream)
    stream.synchronize()

    # Launch with one thread per element and wait for the result.
    dest = drv.pagelocked_empty_like(a)
    launch_block = shape + (1,)
    multiply_them(
        drv.Out(dest), a_gpu, b_gpu,
        block=launch_block, stream=stream)
    stream.synchronize()

    # Download the inputs again and wait before comparing.
    drv.memcpy_dtoh_async(a, a_gpu, stream)
    drv.memcpy_dtoh_async(b, b_gpu, stream)
    stream.synchronize()

    assert la.norm(dest-a*b) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:40,代码来源:test_driver.py
示例11: getRMS
def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id, npairs):
    """
    Returns the rank matching score

    rt_gpu - rank template gpu object (padded_npairs,)
    srt_gpu - sample rank template gpu object (padded_npairs, padded_samples)
    padded_samples, padded_npairs - padded dimensions of srt_gpu
    samp_id - the sample id to compare srt to rt
    npairs - true number of pairs

    The block size for the gpu computation is taken from self.b_size.
    The kernel writes into a device-mapped buffer with one int32 slot per
    launched block (kept on self.result); the score is the sum of that
    buffer divided by npairs.
    """
    threads = self.b_size
    n_blocks = int(padded_npairs/threads)

    partial = cuda.pagelocked_zeros((n_blocks,), dtype=np.int32,
                                    mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    self.result = partial
    partial_dev = np.intp(partial.base.get_device_pointer())

    kernel = self.getrmsKern()
    shared_bytes = threads*np.uint32(1).nbytes
    kernel(rt_gpu, srt_gpu, np.int32(samp_id), np.int32(padded_samples), np.int32(npairs), partial_dev,
           block=(threads, 1, 1), grid=(n_blocks,), shared=shared_bytes)

    # The launch is asynchronous: wait before reading the mapped buffer.
    self.ctx.synchronize()
    return partial.sum()/float(npairs)
开发者ID:JohnCEarls,项目名称:tcDirac,代码行数:19,代码来源:gpu.py
示例12: __init__
def __init__(self, N, model, threshold=None, reset=NoReset(),
             init=None, refractory=0 * msecond, level=0,
             clock=None, order=1, implicit=False, unit_checking=True,
             max_delay=0 * msecond, compile=False, freeze=False, method=None,
             precision='double', maxblocksize=512, forcesync=False, pagelocked_mem=True,
             gpu_to_cpu_vars=None, cpu_to_gpu_vars=None):
    """Create a neuron group whose state update runs on the GPU.

    ``model`` is prepared and handed to ``NeuronGroup.__init__`` together
    with the usual group arguments.  GPU-specific arguments:

    precision -- ``'double'`` selects float64 (8 bytes per value); any
        other value selects float32 (4 bytes).
    maxblocksize, forcesync -- forwarded to the state updater.
    pagelocked_mem -- if true the state matrix is re-created in
        page-locked host memory, otherwise as an ordinary array.
    gpu_to_cpu_vars, cpu_to_gpu_vars -- names of the state variables to
        copy in each direction.  When both are ``None`` a
        ``GPUNonlinearStateUpdater`` is used; otherwise a
        ``UserControlledGPUNonlinearStateUpdater`` is used and a list left
        as ``None`` is treated as empty.
    """
    eqs = model
    eqs.prepare()
    NeuronGroup.__init__(self, N, eqs, threshold=threshold, reset=reset,
                         init=init, refractory=refractory, level=level,
                         clock=clock, order=order, compile=compile, freeze=freeze, method=method)
    self.precision = precision
    if self.precision == 'double':
        self.precision_dtype = float64
        self.precision_nbytes = 8
    else:
        self.precision_dtype = float32
        self.precision_nbytes = 4
    self.clock = guess_clock(clock)
    if gpu_to_cpu_vars is None and cpu_to_gpu_vars is None:
        self._state_updater = GPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                       forcesync=forcesync)
    else:
        group_len = len(self)
        value_bytes = self.precision_nbytes

        def _transfer_spans(var_names):
            # For each variable: (byte offset, first element, one past the
            # last element) of its row in the flattened state matrix.
            # A missing list (None) means "no variables in this direction";
            # previously it raised TypeError when only one list was given.
            spans = []
            for var in var_names or ():
                row = self.get_var_index(var)
                spans.append((row * group_len * value_bytes,
                              row * group_len,
                              (row + 1) * group_len))
            return spans

        cpu_to_gpu_vars = _transfer_spans(cpu_to_gpu_vars)
        gpu_to_cpu_vars = _transfer_spans(gpu_to_cpu_vars)
        self._state_updater = UserControlledGPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                                     gpu_to_cpu_vars=gpu_to_cpu_vars, cpu_to_gpu_vars=cpu_to_gpu_vars)
    if pagelocked_mem:
        self._S = GPUBufferedArray(drv.pagelocked_zeros(self._S.shape, dtype=self.precision_dtype))
    else:
        self._S = GPUBufferedArray(array(self._S, dtype=self.precision_dtype))
    self._gpuneurongroup_init_finished = True
开发者ID:brian-team,项目名称:brian,代码行数:36,代码来源:gpucodegen.py
示例13: __init__
def __init__(self, source, b, a, samplerate=None,
             precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
    """Build and compile the CUDA kernel for a bank of cascaded linear filters.

    source -- input filterbank; a mono source is duplicated to match the
        number of channels in ``b``, any other mismatch raises ValueError.
    b, a -- filter coefficient arrays of shape ``(n, m, p)``: ``n``
        channels, ``m`` coefficients per section, ``p`` sections applied
        one after the other.
    precision -- ``'double'`` selects float64, anything else float32; the
        string is also substituted for the scalar type in the kernel.
    forcesync, pagelocked_mem -- stored on the instance;
        ``pagelocked_mem`` also selects page-locked memory for ``pre_x``.
    unroll_filterorder -- whether to unroll the inner coefficient loop in
        the generated code; ``None`` means unroll when ``m <= 32``.
    """
    # Automatically duplicate mono input to fit the desired output shape
    if b.shape[0]!=source.nchannels:
        if source.nchannels!=1:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
        source = RestructureFilterbank(source, b.shape[0])
    Filterbank.__init__(self, source)
    if pycuda.context is None:
        set_gpu_device(0)
    self.precision=precision
    if self.precision=='double':
        self.precision_dtype=float64
    else:
        self.precision_dtype=float32
    self.forcesync=forcesync
    self.pagelocked_mem=pagelocked_mem
    n, m, p=b.shape
    self.filt_b=b
    self.filt_a=a
    filt_b_gpu=array(b, dtype=self.precision_dtype)
    filt_a_gpu=array(a, dtype=self.precision_dtype)
    filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
    if pagelocked_mem:
        self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
    else:
        self.pre_x=zeros(n, dtype=self.precision_dtype)
    self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
    self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
    self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
    # The attribute keeps the caller's value (possibly None); only the
    # local is defaulted.
    self.unroll_filterorder = unroll_filterorder
    if unroll_filterorder is None:
        unroll_filterorder = m<=32
    # TODO: improve code, check memory access patterns, maybe use local memory
    code='''
#define x(s,i) _x[(s)*n+(i)]
#define y(s,i) _y[(s)*n+(i)]
#define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
#define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
#define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
__global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
{
int j = blockIdx.x * blockDim.x + threadIdx.x;
if(j>=n) return;
for(int s=0; s<numsamples; s++)
{
'''
    # One code fragment per filter section k; the section index is
    # substituted textually so the generated kernel contains no k loop.
    for k in range(p):
        loopcode='''
y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
'''
        if unroll_filterorder:
            for i in range(m-2):
                loopcode+=re.sub('\\bi\\b', str(i), '''
zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
''')
        else:
            loopcode+='''
for(int i=0;i<m-2;i++)
zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
'''
        loopcode+='''
zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
'''
        if k<p-1:
            # Feed this section's output into the next section.
            loopcode+='''
x(s,j) = y(s,j);
'''
        loopcode=re.sub('\\bk\\b', str(k), loopcode)
        code+=loopcode
    code+='''
}
}
'''
    code=code.replace('SCALAR', self.precision)
    code=re.sub("\\bp\\b", str(p), code) #replace the variable by their values
    code=re.sub("\\bm\\b", str(m), code)
    code=re.sub("\\bn\\b", str(n), code)
    self.gpu_mod=pycuda.compiler.SourceModule(code)
    self.gpu_filt_func=self.gpu_mod.get_function("filt")
    blocksize=min(n, 256)
    # Number of blocks needed to cover n channels, rounded up.  Floor
    # division keeps this an integer on Python 3 as well, where ``/``
    # would produce a float.
    gridsize=(n+blocksize-1)//blocksize
    self.block=(blocksize, 1, 1)
    self.grid=(gridsize, 1)
    self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
    self._has_run_once=False
开发者ID:sivaven,项目名称:brian,代码行数:97,代码来源:gpulinearfilterbank.py
示例14: main
#.........这里部分代码省略.........
#dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
#for i in range(0, GenomeDim/4):
#dest[i*8 + 0] = int('0b00100101',2) #CRASHES
#dest[i*8 + 1] = int('0b00010000',2) #CRASHES
#dest[i*8 + 0] = int('0b00101000',2)
#dest[i*8 + 1] = int('0b00000000',2)
#dest[i*8 + 2] = int('0b00000000',2)
#dest[i*8 + 3] = int('0b00000000',2)
#dest[i*8 + 4] = int('0b00000000',2)
#dest[i*8 + 5] = int('0b00000000',2)
#dest[i*8 + 6] = int('0b00000000',2)
#dest[i*8 + 7] = int('0b00000000',2)
# dest[i*4 + 0] = 40
# dest[i*4 + 1] = 0
# dest[i*4 + 2] = 0
# dest[i*4 + 3] = 0
dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
#drv.memcpy_htod(dest_h, dest)
#print "Genomes before: "
#print dest
#Set-up grids
#grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
#grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
#drv.memcpy_htod(grids_h, grids)
#print "Grids:"
#print grids
#Set-up fitness values
#fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
#fitness_h = drv.mem_alloc(fitness.nbytes)
#fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32)
fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
fitness_size_h = drv.mem_alloc(fitness_size.nbytes)
#fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32)
fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes)
#drv.memcpy_htod(fitness_h, fitness)
#print "Fitness values:"
#print fitness
#Set-up grids
#grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0)
grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
#drv.memcpy_htod(grids_h, grids)
#print "Grids:"
#print grids
#Set-up curand
#curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
#curand_h = drv.mem_alloc(curand.nbytes)
curand_h = drv.mem_alloc(40*GenomeDim)
#SearchSpace control
#SearchSpaceSize = 2**24
#BlockDimY = SearchSpaceSize / (2**16)
#BlockDimX = SearchSpaceSize / (BlockDimY)
#print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
#Schedule kernel calls
#MaxBlockDim = 100
OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize)
MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize)
开发者ID:schroeder-dewitt,项目名称:polyomino-self-assembly,代码行数:67,代码来源:main.py
示例15: prepare
def prepare(self):
'''
From Hines 1984 paper, discrete formula is:
A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i)
A_plus: i->i+1
A_minus: i->i-1
This gives the following tridiagonal system:
A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i)
Boundaries, one simple possibility (sealed ends):
-(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n)
A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0)
'''
mid_diameter = zeros(len(self.neuron)) # mid(i) : (i-1) <-> i
mid_diameter[1:] = .5*(self.neuron.diameter[:-1]+self.neuron.diameter[1:])
self.Aplus = zeros(len(self.neuron)) # A+ i -> j = Aplus(j)
self.Aminus = zeros(len(self.neuron)) # A- i <- j = Aminus(j)
self.Aplus[1]= mid_diameter[1]**2/(4*self.neuron.diameter[1]*self.neuron.length[1]**2*self.neuron.Ri)
self.Aplus[2:]=mid_diameter[2:]**2/(4*self.neuron.diameter[1:-1]*self.neuron.length[1:-1]**2*self.neuron.Ri)
self.Aminus[1:]=mid_diameter[1:]**2/(4*self.neuron.diameter[1:]*self.neuron.length[1:]**2*self.neuron.Ri)
self.neuron.index = zeros(len(self.neuron),int) # gives the index of the branch containing the current compartment
self.neuron.branches = [] # (i,j,bp,ante,ante_index,pointType)
# i is the first compartment
# bp is the last, a branch point
# j is the end of the "inner branch". j = bp-1
# ante is the branch point to which i is connected
self.neuron.BPcount = 0 # number of branch points (or branches). = len(self.neuron.branches)
self.neuron.long_branches_count = 0 # number of branches with len(branch) > 1
#self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
#self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
#self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
self.bL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
self.bR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
#self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
self.ab = zeros((3,len(self.neuron)))
self.ab0 = zeros(len(self.neuron))
self.ab1 = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
self.ab2 = zeros(len(self.neuron))
self.ab1_base = zeros(len(self.neuron))
#self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64)
self.mTrunc = 0 # used to truncate vL and vR
self.delta_list = zeros(len(self.neuron)) #used to find mTrunc
# prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus
self.prepare_branch(self.neuron.morphology, mid_diameter,0)
# linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account.
self.P = zeros((self.neuron.BPcount,self.neuron.BPcount))
self.B = zeros(self.neuron.BPcount)
self.solution_bp = zeros(self.neuron.BPcount)
self.gtot = zeros(len(self.neuron))
self.I0 = zeros(len(self.neuron))
self.i_list = []
self.j_list = []
self.i_list_bis = []
self.j_list_bis = []
new_tridiag = True
self.bp_list = []
self.pointType_list = []
self.pointTypeAnte_list = []
self.index_ante_list0 = []
self.index_ante_list1 = []
self.index_ante_list2 = []
self.ante_list = []
self.post_list = []
self.ante_list_idx = []
self.post_list_idx = []
self.id = []
self.test_list = []
temp = zeros(self.neuron.BPcount)
self.ind0 = []
self.ind_bctype_0 = []
for index,(i,j,bp,ante,index_ante,pointType) in enumerate(self.neuron.branches) :
self.i_list.append(i)
self.j_list.append(j)
if new_tridiag:
self.i_list_bis.append(i)
ii = i
else:
ii = self.i_list[-1]
if j-ii+1>2:
self.j_list_bis.append(j)
new_tridiag = True
else :
new_tridiag = False
self.bp_list.append(bp)
self.pointType_list.append(max(1,pointType))
self.pointTypeAnte_list.append(max(1,self.neuron.bc[ante]))
temp[index] = index_ante
self.id.append(index)
if (j-i+2>1):
#.........这里部分代码省略.........
开发者ID:JoErNanO,项目名称:brian,代码行数:101,代码来源:spatialstateupdater_linear.py
示例16: alloc_async_host_buf
def alloc_async_host_buf(self, shape, dtype):
    """Return a zero-filled page-locked host array of the given shape and
    dtype, suitable as a buffer for asynchronous data transfers."""
    pinned = cuda.pagelocked_zeros(shape, dtype=dtype)
    return pinned
开发者ID:mjanusz,项目名称:sailfish,代码行数:4,代码来源:backend_cuda.py
示例17: show
sys.stdout.flush()
start.record()
if rank == 0:
print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()
if rank == 1:
total = np.zeros(tmax)
for key in exec_time.iterkeys():
total[:] += exec_time[key][:]
for key in exec_time.iterkeys():
print key, ":\t %1.2f %%" % (exec_time[key][2:-2].sum() / total[2:-2].sum() * 100)
print "%1.2f GFLOPS\r" % ((tmax - 4) * 3 * nx * ny * nz * 30 / total[2:-2].sum() * 1e-6)
g = cuda.pagelocked_zeros((nx, ny, nz), "f")
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
comm.Send(g, 0, 24)
else:
lg = np.zeros((3 * nx, ny), "f")
lg[:nx, :] = g[:, :, nz / 2]
comm.Recv(g, 1, 24)
lg[nx:-nx, :] = g[:, :, nz / 2]
comm.Recv(g, 2, 24)
lg[2 * nx :, :] = g[:, :, nz / 2]
imsh.set_array(lg.T ** 2)
show() # draw()
# savefig('./png-wave/%.5d.png' % tstep)
stop.record()
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:31,代码来源:150-nGPU-func.py
示例18: range
result = cuda.from_device(a_gpu, (nx,ny), 'float32')
print ngpu
for i in range(1,ngpu):
result = np.concatenate((result, mpi.world.recv(i,10)))
for i in xrange(ny):
print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i]
if __name__ == '__main__':
cuda.init()
ngpu = cuda.Device.count()
ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST)
nx, ny = 6, 5
a_side_f = cuda.pagelocked_zeros(ny, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
a = np.zeros((nx,ny),'f')
if mpi.rank == 0:
a[-2,:] = 1.5
elif mpi.rank == 1:
a[1,:] = 2.0
a[-2,:] = 2.5
elif mpi.rank == 2:
a[1,:] = 3.0
a_gpu = cuda.to_device(a)
if mpi.rank == 0: print 'dev 0','\t'*5,'dev 1','\t'*5,'dev 2'
print_arr_gpus(ngpu, nx, ny, a_gpu)
if mpi.rank == 0:
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:31,代码来源:011-mpi-exchange-zero_copy.py
示例19: set_c
ey_gpu = cuda.to_device(f)
ez_gpu = cuda.to_device(f)
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)
cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) )
cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) )
cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) )
chx_gpu = cuda.to_device( set_c(f,(None,0,0)) )
chy_gpu = cuda.to_device( set_c(f,(0,None,0)) )
chz_gpu = cuda.to_device( set_c(f,(0,0,None)) )
# pinned memory allocation for zero-copy
if myrank != 1:
ex_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
ey_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
hx_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
hy_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
'''
ex_send_map = ex_send.get_device_pointer()
ey_send_map = ey_send.get_device_pointer()
hx_recv_map = hx_recv.get_device_pointer()
hy_recv_map = hy_recv.get_device_pointer()
'''
if myrank != 3:
ex_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
ey_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
hx_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
hy_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F', mem_flags=cuda.host_alloc_flags.DEVICEMAP)
'''
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:31,代码来源:040-block1d-texture-smem-if-over_maxgrid-mpi-zero_copy.py
示例20: SourceModule
# Script fragment: adds two random float32 vectors on the GPU and checks
# both output buffers against the host-side sum.
# Setup the kernel
# The kernel writes the same sum to two outputs, `c` and `c_map`.
mod = SourceModule("""
__global__ void add(float *a, float *b, float *c, float *c_map) {
int idx = blockIdx.x*blockDim.x + threadIdx.x;
float val;
val = a[idx] + b[idx];
c[idx] = val;
c_map[idx] = val;
}
""")
add = mod.get_function("add")
# Memory allocation
nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)
c = np.zeros_like(a)
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
# Page-locked host memory allocation for zero-copy
c_map = cuda.pagelocked_zeros(nx, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
# 4 blocks of 256 threads give one thread per element (4 * 256 == nx).
# NOTE(review): c_map is passed through cuda.Out(...) instead of its
# mapped device pointer -- confirm this is the intended zero-copy usage.
add( a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256,1,1), grid=(4,1) )
# Both outputs must match the host result exactly.
assert( np.linalg.norm( (a+b)-c ) == 0 )
assert( np.linalg.norm( (a+b)-c_map ) == 0 )
# `ctx` is created outside this fragment.
ctx.pop()
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:30,代码来源:zero_copy-test-01.py
注:本文中的 pycuda.driver.pagelocked_zeros 函数示例由纯净天空整理自 Github/MSDocs 等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。
请发表评论