This article collects typical usage examples of the Python function pycuda.driver.mem_alloc. If you have been wondering what exactly mem_alloc does, how to call it, or what real code using it looks like, the curated examples below should help.
The sections that follow present 20 code examples of mem_alloc, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
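Before the project examples, here is a minimal, self-contained sketch of the typical mem_alloc workflow: size the allocation in bytes from a NumPy array, copy host to device, launch a kernel, and copy the result back. The doubling kernel and the array size are illustrative choices for this sketch, not taken from any project below.

import numpy as np
import pycuda.autoinit          # creates a context on the first available GPU
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# Host data: mem_alloc takes a size in bytes, so nbytes is the usual argument.
a = np.random.randn(256).astype(np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)               # host -> device

# A toy kernel (illustrative) that doubles each element in place.
mod = SourceModule("""
__global__ void double_it(float *a)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    a[i] *= 2.0f;
}
""")
double_it = mod.get_function("double_it")
double_it(a_gpu, block=(256, 1, 1), grid=(1, 1))

out = np.empty_like(a)
cuda.memcpy_dtoh(out, a_gpu)             # device -> host
a_gpu.free()                             # optional: also freed when the object is garbage-collected
assert np.allclose(out, 2 * a)

mem_alloc returns a DeviceAllocation object whose memory is released when the object is garbage-collected, so the explicit free() is optional but makes the buffer's lifetime obvious.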
Example 1: diffuse_pycuda

def diffuse_pycuda(u):
    nx, ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dy = np.float32(3.5 / (ny - 1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time / dt))
    # Dirichlet boundary conditions on two edges
    u[0, :] = 200
    u[:, 0] = 200
    u = u.astype(np.float32)
    u_prev = u.copy()
    # Allocate device buffers (sizes in bytes) and copy both arrays over
    u_d = cuda.mem_alloc(u.size * u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size * u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)
    BLOCKSIZE = 16
    gridSize = (int(np.ceil(nx / BLOCKSIZE)), int(np.ceil(nx / BLOCKSIZE)), 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)
    for t in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE),
                   block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE),
               block=blockSize, grid=gridSize)
    cuda.memcpy_dtoh(u, u_d)
    return u

Developer: htapia | Project: lania.pd | Lines: 34 | Source: diffuse.py
Example 2: get_spharms_l_eq_2

def get_spharms_l_eq_2(theta, phi, selected_Modes_gpu, rslt_gpu):
    # selected_modes, nsamps, theta_m and mod are module-level names in the
    # original project; this snippet does not define them itself.
    modelist = np.array(sorted([mode[1] for mode in selected_modes])).astype(np.int32)
    modelist_gpu = cuda.mem_alloc(modelist.nbytes)
    # nsampslen = np.array(len(theta), ndmin=1).astype(np.int32)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)
    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)
    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    # cuda.memcpy_htod(theta_gpu, theta)
    # cuda.memcpy_htod(phi_gpu, phi)
    cuda.memcpy_htod(modelist_gpu, modelist)
    # Get the compiled CUDA function
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")
    result_gpu = cuda.mem_alloc(theta_m.nbytes * len(modelist) * 2)
    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(theta, phi, modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu, block=blk, grid=grd)
    # cuda.memcpy_dtoh(result, result_gpu)
    return

Developer: brandonbm00 | Project: rapidpe_gpu | Lines: 30 | Source: sph_harmonics_cu.py
Example 3: main

def main():
    (h, w), d = (826, 1169), 3  # img1.size, len(img1_arr[0][0])
    if LINEAR:
        thread_x, thread_y, thread_z = 128, 1, 1
        block_x, block_y = (w * h * d) // thread_x, 1
        if (w * h * d) % thread_x:
            block_x += 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x, block_y = h // thread_x, w // thread_y
        if h % thread_x:
            block_x += 1
        if w % thread_y:
            block_y += 1
    # print (h, w, d), (thread_x, thread_y, thread_z), (block_x, block_y)
    # Three buffers sized for the largest image; reused across all page pairs
    image_data_size = 2896782 * 4
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)
    image_path_pairs = []
    for i in range(50):  # xrange in the original Python 2 code
        page_num = i + 1
        path1, path2 = 'form1.%d.png' % page_num, 'form2.%d.png' % page_num
        image_path_pairs.append((path1, path2))
    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu,
            (thread_x, thread_y, thread_z), (block_x, block_y))

Developer: B-Rich | Project: python_scripts | Lines: 28 | Source: accelerated.py
Example 4: __compute_sub_gaussian_gpu

def __compute_sub_gaussian_gpu(self, sub_partitions):
    if sub_partitions < 1:
        raise Exception("You can't have less than 1 partition")
    elif sub_partitions > self.pts.shape[0]:
        raise Exception("sub partitions need to be smaller than pts size")
    # Delta partitions
    d_part = self.pts.shape[0] // sub_partitions
    # Allocate one sub-partition's worth of 2-D points on the device
    alloc_size = self.pts.shape[0] // sub_partitions * 2 * self.pts.itemsize
    self.pts_gpu = cuda.mem_alloc(alloc_size)
    self.pts[:, 0] = (self.pts[:, 0] - self.axis[0]) / (self.axis[1] - self.axis[0])
    self.pts[:, 1] = (self.pts[:, 1] - self.axis[2]) / (self.axis[3] - self.axis[2])
    for partition in range(sub_partitions):
        sub_pts = self.pts[partition * d_part:(partition + 1) * d_part, :]
        self.__compute_guassian_on_pts(sub_pts)
    self.pts_gpu.free()
    # Check whether there is a remainder of points to work with
    if self.pts.shape[0] % sub_partitions:
        alloc_size = (self.pts.shape[0] % sub_partitions) * (2 * self.pts.itemsize)
        self.pts_gpu = cuda.mem_alloc(alloc_size)
        self.__compute_guassian_on_pts(self.pts[sub_partitions * d_part:, :])
        self.pts_gpu.free()

Developer: SCIInstitute | Project: MLM | Lines: 25 | Source: gaussian_gpu.py
Example 5: calc_psd

def calc_psd(self, bitloads, xtalk):
    # Number of expected permutations
    Ncombinations = self.K
    # Check if this is getting hairy and assign grid/block dimensions
    (warpcount, warpperblock, threadCount, blockCount) = self._workload_calc(Ncombinations)
    # How many individual lk's
    memdim = blockCount * threadCount
    threadshare_grid = (blockCount, 1)
    threadshare_block = (threadCount, 1, 1)
    # Memory (we get away with Ncombinations because calc_psd checks against it)
    d_a = cuda.mem_alloc(np.zeros((Ncombinations * self.N * self.N)).astype(self.type).nbytes)
    d_p = cuda.mem_alloc(np.zeros((Ncombinations * self.N)).astype(self.type).nbytes)
    d_bitload = cuda.mem_alloc(np.zeros((self.K * self.N)).astype(np.int32).nbytes)
    d_XTG = cuda.mem_alloc(np.zeros((self.K * self.N * self.N)).astype(self.type).nbytes)
    h_p = np.zeros((self.K, self.N)).astype(self.type)
    cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
    cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))
    # Go solve
    # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N)
    self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(Ncombinations),
                   block=threadshare_block, grid=threadshare_grid)
    cuda.Context.synchronize()
    cuda.memcpy_dtoh(h_p, d_p)
    # Free device buffers explicitly once the result is on the host
    d_a.free()
    d_bitload.free()
    d_XTG.free()
    d_p.free()
    return h_p.astype(np.float64)

Developer: andrewbolster | Project: multiuserDSM | Lines: 32 | Source: gpu.py
Example 6: cuda_crossOver

def cuda_crossOver(sola, solb):
    """Cross two solution vectors on the GPU and return the new pair."""
    sol_len = len(sola)
    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)
    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)
    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))
    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)
    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)
    if debug:
        print("a:", sola)
        print("b:", solb)
        print("new a:", a_new)
        print("new b:", b_new)
    return a_new, b_new

Developer: adamuas | Project: coevondm | Lines: 27 | Source: cudaInterface.py
Example 7: alloc

def alloc(self, dim, stream=None):
    """
    Ensure that this object's framebuffers are large enough to handle the
    given dimensions, allocating new ones if not.

    If ``stream`` is not None and a reallocation is necessary, the stream
    will be synchronized before the old buffers are deallocated.
    """
    nbins = dim.ah * dim.astride
    if self.nbins >= nbins:
        return
    if self.nbins is not None:
        self.free()
    try:
        self.d_front = cuda.mem_alloc(16 * nbins)
        self.d_back = cuda.mem_alloc(16 * nbins)
        self.d_side = cuda.mem_alloc(16 * nbins)
        self.nbins = nbins
    except cuda.MemoryError as e:
        # If a frame that's too large sneaks by the task distributor, we
        # don't want to kill the server, but we also don't want to leave
        # it stuck without any free memory to complete the next alloc.
        # TODO: measure free mem and only take tasks that fit (but that
        # should be done elsewhere)
        self.free(stream)
        raise e

Developer: vincentmele | Project: cuburn | Lines: 26 | Source: render.py
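Example 7 frees its buffers and re-raises when mem_alloc throws cuda.MemoryError. In the same spirit, here is a compact, hypothetical helper (a sketch, not part of any project on this page) that also consults cuda.mem_get_info() before allocating:

import pycuda.autoinit
import pycuda.driver as cuda

def try_alloc(nbytes):
    """Return a DeviceAllocation of nbytes, or None if it cannot fit."""
    free, total = cuda.mem_get_info()   # bytes currently free / total on the device
    if nbytes > free:
        return None
    try:
        return cuda.mem_alloc(nbytes)
    except cuda.MemoryError:
        # Lost a race with another allocation, or free space is fragmented.
        return None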
Example 8: calc_blob_blob_forces_pycuda

def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
    # Determine number of threads and blocks for the GPU
    number_of_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)
    # Get parameters from arguments
    L = kwargs.get('periodic_length')
    eps = kwargs.get('repulsion_strength')
    b = kwargs.get('debye_length')
    blob_radius = kwargs.get('blob_radius')
    # Reshape arrays
    x = np.reshape(r_vectors, number_of_blobs * 3)
    f = np.empty_like(x)
    # Allocate GPU memory
    x_gpu = cuda.mem_alloc(x.nbytes)
    f_gpu = cuda.mem_alloc(f.nbytes)
    # Copy data to the GPU (host to device)
    cuda.memcpy_htod(x_gpu, x)
    # Get blob-blob force function
    force = mod.get_function("calc_blob_blob_force")
    # Compute mobility force product
    force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius),
          np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs,
          block=(threads_per_block, 1, 1), grid=(num_blocks, 1))
    # Copy data from GPU to CPU (device to host)
    cuda.memcpy_dtoh(f, f_gpu)
    return np.reshape(f, (number_of_blobs, 3))

Developer: stochasticHydroTools | Project: RigidMultiblobsWall | Lines: 33 | Source: forces_pycuda_user_defined.py
Example 9: __init__

def __init__(self, max_size, offsets=None):
    """
    Create a sorter. The sorter will hold on to internal resources for as
    long as it is alive, including an 'offsets' array of size 4*max_size.
    To share this cost, you may pass in an array of at least this size to
    __init__ (to, for instance, share across different bit-widths in a
    multi-pass sort).
    """
    self.init_mod()
    self.max_size = max_size
    assert max_size % self.group_size == 0
    max_grids = max_size // self.group_size
    if offsets is None:
        self.doffsets = cuda.mem_alloc(self.max_size * 4)
    else:
        self.doffsets = offsets
    self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
    self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)
    # There are probably better ways to choose how many condensation
    # groups to launch. TODO: maybe pick one if I care
    self.ncond = 32
    self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
    self.dglobal = cuda.mem_alloc(self.radix_size * 4)

Developer: gijzelaerr | Project: cuburn | Lines: 25 | Source: sort.py
Example 10: prepare_device_arrays

def prepare_device_arrays(self):
    self.maxLayers = self.grid_prop.GetMaxLayers()
    nczbins_fine = len(self.czcen_fine)
    numLayers = np.zeros(nczbins_fine, dtype=np.int32)
    densityInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)
    distanceInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)
    self.grid_prop.GetNumberOfLayers(numLayers)
    self.grid_prop.GetDensityInLayer(densityInLayer)
    self.grid_prop.GetDistanceInLayer(distanceInLayer)
    # Copy all these earth info arrays to the device:
    self.d_numLayers = cuda.mem_alloc(numLayers.nbytes)
    self.d_densityInLayer = cuda.mem_alloc(densityInLayer.nbytes)
    self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)
    cuda.memcpy_htod(self.d_numLayers, numLayers)
    cuda.memcpy_htod(self.d_densityInLayer, densityInLayer)
    cuda.memcpy_htod(self.d_distanceInLayer, distanceInLayer)
    self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
    self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
    cuda.memcpy_htod(self.d_ecen_fine, self.ecen_fine)
    cuda.memcpy_htod(self.d_czcen_fine, self.czcen_fine)
    return

Developer: gkrueckl | Project: pisa | Lines: 26 | Source: Prob3GPUOscillationService.py
Example 11: poisson_parallel

def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    # Create a Cheetah template and fill in variables for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors
    # Compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")
    # Allocate memory on the GPU
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)
    # Run the Poisson blending kernel n times
    for i in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)
    # Retrieve and return the final output image
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image

Developer: JMTing | Project: cs205 | Lines: 27 | Source: parallel_poisson.py
Example 12: __init__

def __init__(self, init_data, n_generators):
    self.ctx = curr_gpu.make_context()
    self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy: %f%% free" % (free * 100 / total))
    print("Global free memory: %i Mo free" % (free / 10 ** 6))
    self.width_mat = np.int32(init_data.shape[0])
    # self.gpu_init_data = ga.to_gpu(init_data)
    self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
    cuda.memcpy_htod(self.gpu_init_data, init_data)
    self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
    print("size new data =", self.cpu_new_data.nbytes / 10 ** 6)
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy: %f%% free" % (free * 100 / total))
    print("Global free memory: %i Mo free" % (free / 10 ** 6))
    self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
    cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
    # self.gpu_new_data = ga.to_gpu(self.cpu_new_data)
    self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
    self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
    cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
    # self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
    # Look up the kernels compiled into the module
    self.init_rng = self.module.get_function("init_rng")
    self.gen_rand_mat = self.module.get_function("gen_rand_mat")
    self.sum_along_axis = self.module.get_function("sum_along_axis")
    self.norm_along_axis = self.module.get_function("norm_along_axis")
    self.init_vect_sum = self.module.get_function("init_vect_sum")
    self.copy_mat = self.module.get_function("copy_mat")
    # One cuRAND state per generator; sizeof() comes from pycuda.characterize
    self.n_generators = n_generators
    seed = 1
    self.rng_states = cuda.mem_alloc(
        n_generators
        * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    )
    self.init_rng(
        np.int32(n_generators),
        self.rng_states,
        np.uint64(seed),
        np.uint64(0),
        block=(64, 1, 1),
        grid=(n_generators // 64 + 1, 1),
    )
    (free, total) = cuda.mem_get_info()
    size_block_x = 32
    size_block_y = 32
    n_blocks_x = int(self.width_mat) // size_block_x + 1
    n_blocks_y = int(self.width_mat) // size_block_y + 1
    self.grid = (n_blocks_x, n_blocks_y, 1)
    self.block = (size_block_x, size_block_y, 1)

Developer: koszullab | Project: centroID | Lines: 59 | Source: cuda_lib.py
Example 13: confirmInitialization

def confirmInitialization(featuresForSOM, somMatrix):
    # Allocate memory for the SOM matrix on the device
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr, somMatrix)
    # Allocate space for the BMU distances and indices
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')
    objVal = 0.0
    objSampSize = 0.0
    r = [[[0.0 for k in range(0, 3)] for i in range(somMatrixColumns)] for j in range(somMatrixRows)]
    nodeHitMatrix = np.array(r).astype(np.float32)
    hitCountDict = defaultdict(list)
    samples = [x for x in range(0, somMatrixRows * somMatrixColumns)]
    if len(samples) >= len(featuresForSOM):
        samples = [x for x in range(0, len(featuresForSOM))]
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr, feats)
        # Find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr,
                   np.int32(len(featuresForSOM.columns)), somBytesPerRow,
                   somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT),
                   np.float32(INF), np.int32(metric),
                   block=(blk, 1, 1), grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        val = hitCountDict[(block, thread)]
        if val is None or len(val) == 0:
            hitCountDict[(block, thread)] = [1, i]
        else:
            hitCountDict[(block, thread)][0] += 1
        val = np.int32(hitCountDict[(block, thread)])[0]
        # Map hit counts to RGB: green 1, blue <=10, red <=100, white >100
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval / 255.0, gval / 255.0, bval / 255.0]
    fig20 = plt.figure(20, figsize=(6 * 3.13, 4 * 3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    ax = plt.subplot(111)
    somplot = plt.imshow(nodeHitMatrix, interpolation="none")
    plt.show()
    plt.pause(0.1)

Developer: kdkoadd | Project: Self-Organizing-Map | Lines: 58 | Source: sommap.py
Example 14: computeAvgDistancetoBMU

def computeAvgDistancetoBMU(currentIter, iterationDistance, features, nodeHitMatrix,
                            somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr,
                            featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns):
    adjustNodes = {}
    sampSize = 0
    cumDistance = 0.0
    nodeHitMatrix.fill(0)
    hitCountDict.clear()
    if len(featuresForSOM) < 100:
        sampSize = len(featuresForSOM)
    elif currentIter < len(featuresForSOM):
        sampSize = int(currentIter)
        if sampSize == 0:
            sampSize = min(somNumberOfRows * somNumberOfColumns, len(featuresForSOM))
    else:
        sampSize = len(featuresForSOM)
    samples = [x for x in range(0, sampSize)]
    # Allocate space for the BMU distances
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    # Allocate space for the BMU indices
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr, feats)
        # Find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr,
                   np.int32(featureCount), somBytesPerRow, somNumberOfRows,
                   somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),
                   np.int32(metric), block=(blk, 1, 1), grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        cumDistance += np.min(bmu)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        adjustNodes[i] = [block, thread]
        val = hitCountDict[(block, thread)]
        if val is None or len(val) == 0:
            hitCountDict[(block, thread)] = [1, i]
        else:
            hitCountDict[(block, thread)][0] += 1
        val = np.int32(hitCountDict[(block, thread)])[0]
        # Same hit-count-to-color mapping as in Example 13
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval / 255.0, gval / 255.0, bval / 255.0]
    iterationDistance.append(cumDistance / sampSize)
    iterationCount.append(currentIter)
    return cumDistance / sampSize

Developer: kdkoadd | Project: Self-Organizing-Map | Lines: 56 | Source: sommap.py
Example 15: set_refsmiles

def set_refsmiles(self, refsmilesmat, refcountsmat, reflengths, refmags=None):  # {{{
    """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
    and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
    vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
    no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
    """
    # Set up lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Set up using PyCUDA CUDAArray support
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manually handle setup
        temprlmat = self._padded_array(refsmilesmat)
        if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)." % temprlmat.shape)
        self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, temprlmat, stream=self.gpu.stream)
        temprcmat = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, temprcmat, stream=self.gpu.stream)
        descriptor = cuda.ArrayDescriptor()
        descriptor.width = temprcmat.shape[1]
        descriptor.height = temprcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor, temprlmat.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor, temprcmat.strides[0])
        self.gpu.stream.synchronize()
        del temprlmat
        del temprcmat
    # }}}
    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]
    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)
    # Allocate buffer for reference set magnitudes
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is not None:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    else:
        # Calculate reference set magnitudes on the GPU
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu, self.gpu.rl_gpu, numpy.int32(self.nref),
                              block=(magthreads, 1, 1), grid=(30, 1),
                              shared=magthreads * 4, texrefs=[self.gpu.tex2cr])
    return

Developer: ihaque | Project: SIML | Lines: 56 | Source: GPULingo.py
Example 16: gfx_init

def gfx_init(self):
    try:
        print('compiling')
        self.prog = sh.compile_program_vfg('shad/balls')
        print('compiled')
        self.loc_mmv = sh.get_loc(self.prog, 'modelview')
        self.loc_mp = sh.get_loc(self.prog, 'projection')
        self.l_color = sh.get_loc(self.prog, 'color')
        self.l_size = sh.get_loc(self.prog, 'ballsize')
    except ValueError as ve:
        print("Shader compilation failed: " + str(ve))
        sys.exit(0)
    #
    # cuda init
    #
    self.grid = (int(self.BOX), int(self.BOX))
    self.block = (1, 1, int(self.BOX))
    print('CUDA: block %s , grid %s' % (str(self.block), str(self.grid)))
    floatbytes = np.dtype(np.float32).itemsize
    # Upload particle positions through an OpenGL buffer
    self.gpos = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, self.gpos)
    glBufferData(GL_ARRAY_BUFFER, self.pos.nbytes, self.pos, GL_STREAM_DRAW)
    glBindBuffer(GL_ARRAY_BUFFER, 0)
    # Double-buffered distribution functions for the LBM kernels
    self.df1 = cuda_driver.mem_alloc(self.f.nbytes)
    self.df2 = cuda_driver.mem_alloc(self.f.nbytes)
    cuda_driver.memcpy_htod(self.df1, self.f)
    cuda_driver.memset_d32(self.df2, 0, self.NUM * self.Q)
    mod = cuda_driver.module_from_file('lbm_kernel.cubin')
    self.collision = mod.get_function("collision_step")
    self.collision.prepare("Piii")
    self.streaming = mod.get_function("streaming_step")
    self.streaming.prepare("PPiii")
    self.colors = mod.get_function("colors")
    self.colors.prepare("PPiii")

Developer: jkotur | Project: particles | Lines: 55 | Source: lbm.py
Example 17: _initME

def _initME(self):
    """Initializes the MotionEnergy CUDA functions."""
    logging.debug('initME')
    # Register all device functions for easy access
    # (imported from motion_energy_device.py)
    self.dev_conv1 = mod.get_function("dev_conv1")
    self.dev_convn = mod.get_function("dev_convn")
    self.dev_accumDiffStims = mod.get_function("dev_accumDiffStims")
    self.dev_filt2dir = mod.get_function("dev_filt2dir")
    self.dev_edges = mod.get_function("dev_edges")
    self.dev_fullRect2 = mod.get_function("dev_fullRect2")
    self.dev_mean3 = mod.get_function("dev_mean3")
    self.dev_normalize = mod.get_function("dev_normalize")
    self.dev_split_gray = mod.get_function("dev_split_gray")
    self.dev_split_RGB = mod.get_function("dev_split_RGB")
    self.dev_sub = mod.get_function("dev_sub")
    self.dev_ave = mod.get_function("dev_ave")
    self.dev_sum = mod.get_function("dev_sum")
    self.dev_scaleHalfRect = mod.get_function("dev_scaleHalfRect")
    self.dev_scale = mod.get_function("dev_scale")
    self.dev_memcpy_dtod = mod.get_function("dev_memcpy_dtod")
    # For quick access: the size in bytes of nrX*nrY floats
    self.szXY = self.sizeofFloat * self.nrX * self.nrY
    # V1 filter responses
    self.d_resp = cuda.mem_alloc(self.szXY * self.nrFilters * self.nrScales)
    # V1 complex cell responses
    self.d_respV1c = cuda.mem_alloc(self.szXY * self.nrDirs)
    # Stim frame
    self.d_stim = cuda.mem_alloc(self.szXY * self.nrC)
    # Stim frame buffer (last nrT frames)
    self.d_stimBuf = cuda.mem_alloc(self.szXY * self.nrT)
    # I'm not sure if this memset works as expected... for now, memcpy an
    # array of zeros
    # cuda.memset_d32(self.d_stimBuf, 0, self.nrX*self.nrY*self.nrT)
    tmp = np.zeros(self.nrX * self.nrY * self.nrT).astype(np.float32)
    cuda.memcpy_htod(self.d_stimBuf, tmp)
    self.d_diffV1GausBufT = cuda.mem_alloc(self.szXY * self.v1GaussFiltSize)
    self.d_scalingStimBuf = cuda.mem_alloc(self.szXY * self.nrT)
    self.d_v1GausBuf = cuda.mem_alloc(self.szXY * self.v1GaussFiltSize)
    self.d_diffV1GausBuf = cuda.mem_alloc(self.szXY * self.v1GaussFiltSize)
    self.d_pop = cuda.mem_alloc(self.szXY * self.nrScales)
    # Filter banks exposed as module globals
    self.d_scalingFilt = mod.get_global("d_scalingFilt")[0]
    self.d_v1GaussFilt = mod.get_global("d_v1GaussFilt")[0]
    self.d_complexV1Filt = mod.get_global("d_complexV1Filt")[0]
    self.d_normV1filt = mod.get_global("d_normV1filt")[0]
    self.d_diff1filt = mod.get_global("d_diff1filt")[0]
    self.d_diff2filt = mod.get_global("d_diff2filt")[0]
    self.d_diff3filt = mod.get_global("d_diff3filt")[0]

Developer: UCI-CARL | Project: MotionEnergy | Lines: 59 | Source: motionenergy.py
Example 18: CudaRPN

def CudaRPN(inPath, outPath, mycode, mydata, **kw):
    """CudaRPN implements the interface to the CUDA run environment."""
    verbose = kw.get('verbose', False)
    BLOCK_SIZE = 1024  # Kernel grid and block size
    STACK_SIZE = 64
    # OFFSETS = 64
    # unary_operator_names = {'plus': '+', 'minus': '-'}
    function = Function(
        start=len(hardcase),
        bss=64,
        handcode=kw.get('handcode'))
    with Timing('Total execution time'):
        with Timing('Get and convert image data to gpu ready'):
            im = Image.open(inPath)
            px = array(im).astype(float32)
            function.assemble(mycode, mydata, verbose=True)
            function.disassemble(verbose=True)
            cx = array(function.final).astype(int32)
            dx = array(function.data).astype(float32)
        with Timing('Allocate mem to gpu'):
            d_px = mem_alloc(px.nbytes)
            memcpy_htod(d_px, px)
            d_cx = mem_alloc(cx.nbytes)
            memcpy_htod(d_cx, cx)
            d_dx = mem_alloc(dx.nbytes)
            memcpy_htod(d_dx, dx)
        with Timing('Kernel execution time'):
            block = (BLOCK_SIZE, 1, 1)
            checkSize = int32(im.size[0] * im.size[1])
            grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)
            kernel = INCLUDE + HEAD + function.body + convolve + TAIL
            sourceCode = kernel % {
                'pixelwidth': 3,
                'stacksize': STACK_SIZE,
                'case': function.case}
            with open("RPN_sourceCode.c", "w") as target:
                target.write(sourceCode)
            module = SourceModule(sourceCode)
            func = module.get_function("RPN")
            func(d_px, d_cx, d_dx, checkSize, block=block, grid=grid)
        with Timing('Get data from gpu and convert'):
            RPNPx = empty_like(px)
            memcpy_dtoh(RPNPx, d_px)
            RPNPx = uint8(RPNPx)
        with Timing('Save image time'):
            pil_im = Image.fromarray(RPNPx, mode="RGB")
            pil_im.save(outPath)
    # Output final statistics
    if verbose:
        print('%40s: %s%s' % ('Target image', outPath, im.size))
        print(Timing.text)

Developer: jlettvin | Project: shmathd | Lines: 53 | Source: gpu11.py
Example 19: add

def add(slice_a, slice_b):
    slice_c = np.empty_like(slice_a)
    a_gpu = cuda.mem_alloc(slice_a.nbytes)
    cuda.memcpy_htod(a_gpu, slice_a)
    b_gpu = cuda.mem_alloc(slice_b.nbytes)
    cuda.memcpy_htod(b_gpu, slice_b)
    c_gpu = cuda.mem_alloc(slice_c.nbytes)
    start = time.time()
    func(a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1))
    end = time.time()
    cuda.memcpy_dtoh(slice_c, c_gpu)
    return (slice_c, end - start)

Developer: mpitx | Project: psychic-octo-ninja | Lines: 12 | Source: vector_sum_cudampi.py
Example 20: stepN

def stepN(self, positions, velocities, n):
    import numpy as np
    x_gpu = cuda.mem_alloc(positions.nbytes)
    v_gpu = cuda.mem_alloc(velocities.nbytes)
    cuda.memcpy_htod(x_gpu, positions)
    cuda.memcpy_htod(v_gpu, velocities)
    self.cuBoris(x_gpu, v_gpu, np.int32(n), block=(1024, 1, 1),
                 grid=(self.numParts // 1024 + 1, 1))
    # Results are written back into the caller's arrays in place
    cuda.memcpy_dtoh(positions, x_gpu)
    cuda.memcpy_dtoh(velocities, v_gpu)

Developer: npbarnes | Project: Cuda-hybrid-PIC | Lines: 12 | Source: Electromagnetism.py
Note: the pycuda.driver.mem_alloc examples in this article were compiled by 纯净天空 from open-source projects hosted on GitHub, MSDocs, and similar platforms. The code fragments were selected from projects contributed by various developers, and copyright in each fragment remains with its original author; consult the corresponding project's license before reusing or redistributing. Do not reproduce without permission.