本文整理汇总了Python中pycuda.driver.to_device函数的典型用法代码示例。如果您正苦于以下问题：Python to_device函数的具体用法？Python to_device怎么用？Python to_device使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了to_device函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_compare_order
def test_compare_order():
    '''
    Compare what the 'compare_order' CUDA kernel produces for a C-ordered
    (row-major) and an F-ordered (column-major) 2-D view of the same data.

    Each 2-D view is uploaded with cuda.to_device, handed to the kernel
    together with a 1-D output buffer, and the downloaded output is printed
    so both orders can be compared by eye.  Nothing is asserted or returned.
    '''
    compare_order = mod_cu.get_function('compare_order')

    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx, ny), order='C')
    f_2d_F = f_1d.reshape((nx, ny), order='F')

    # Single-argument print() with %-formatting emits exactly the text the
    # old Python 2 print statements produced, and also runs on Python 3.
    print('')
    print('f_1d_C\n\n%s' % (f_1d,))
    print('f_2d_C\n%s' % (f_2d_C,))
    print('f_2d_F\n%s' % (f_2d_F,))
    print('')
    print('after cuda')

    # One host/device output pair is reused for both kernel launches.
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    for label, f_2d in (('f_2d_C', f_2d_C), ('f_2d_F', f_2d_F)):
        f_2d_gpu = cuda.to_device(f_2d)
        # One block of nx*ny threads; presumably one thread per element.
        compare_order(f_2d_gpu, f_1d_gpu, block=(nx*ny, 1, 1), grid=(1, 1))
        cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
        print('f_1d from %s\n%s' % (label, ret_f_1d))
开发者ID:wbkifun,项目名称:my_stuff,代码行数:31,代码来源:compare_order_C_F.py
示例2: multiply_csr
def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Multiply a matrix by a vector with the CUDA kernel for the CSR format.
    The calculation is executed on an nVidia GPU.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiply.
    vector : numpy array
        Vector to multiply. Its length must equal the number of columns
        of the matrix.
    block_size : int (recommended 128 or 256)
        Size of a CUDA block.
    repeat : int > 0
        Number of repetitions of the multiplication. It has no effect on
        the result. Specifies the length of the returned list of
        execution times.

    Returns
    =======
    Tuple of the multiplication result and the list of execution times.

    Raises
    ======
    ArithmeticError
        If the vector length differs from the number of matrix columns.
    '''
    if len(vector) != matrix.shape[1]:
        # The trailing space matters: the two literals are concatenated.
        raise ArithmeticError('Length of the vector is not equal to the '
                              'number of columns of the matrix.')

    matrix = mf.convert_to_scipy_csr(matrix)

    # The three CSR arrays are converted to fixed-width dtypes on the host
    # and then uploaded; the names are rebound to the device allocations.
    data = numpy.array(matrix.data, dtype=numpy.float32)
    indices = numpy.array(matrix.indices, dtype=numpy.int32)
    indptr = numpy.array(matrix.indptr, dtype=numpy.int32)
    data = cuda.to_device(data)
    indices = cuda.to_device(indices)
    indptr = cuda.to_device(indptr)

    num_rows = matrix.shape[0]
    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    # Enough blocks of block_size threads to cover every row.
    grid_size = int(numpy.ceil((num_rows+0.0)/block_size))
    block = (block_size, 1, 1)
    grid = (grid_size, 1)

    g_vector = cuda.to_device(vector)
    num_rows = numpy.int32(num_rows)

    # The kernel reads the vector through a texture reference.
    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]

    # NOTE(review): `start` and `end` are not defined in this function;
    # presumably they are module-level cuda.Event objects -- confirm.
    for _ in range(repeat):
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))
    return (result, time_list)
开发者ID:fivitti,项目名称:SMDV,代码行数:60,代码来源:matrixmultiplication.py
示例3: allocation
def allocation(self):
    """Run the parent allocation, then create the device-side buffers.

    Four buffers are created as separate device copies of ``self.ul``;
    ``el_sum_gpu`` is created from ``self.ne`` zeros.
    """
    super(DGModalGpu, self).allocation()

    # One to_device() call per name, so each attribute owns its own buffer.
    for attr in ('ul_gpu', 'ul_prev_gpu', 'ul_tmp_gpu', 'kl_gpu'):
        setattr(self, attr, cuda.to_device(self.ul))

    self.el_sum_gpu = cuda.to_device(np.zeros(self.ne))
开发者ID:wbkifun,项目名称:my_research,代码行数:7,代码来源:dg_modal_gpu.py
示例4: test_simple_kernel_2
def test_simple_kernel_2(self):
    """Multiply two vectors on the GPU using buffers made by to_device().

    The second half repeats the launch with the first input pointer
    advanced by one element, checking that raw pointer arithmetic on a
    device allocation works.
    """
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    n = 400
    a = np.random.randn(n).astype(np.float32)
    b = np.random.randn(n).astype(np.float32)
    a_gpu = drv.to_device(a)
    b_gpu = drv.to_device(b)

    product = np.zeros_like(a)
    multiply_them(drv.Out(product), a_gpu, b_gpu, block=(n, 1, 1))
    assert la.norm(product - a*b) == 0

    drv.Context.synchronize()

    # now try with offsets
    shifted_a = np.intp(a_gpu) + a.itemsize
    product = np.zeros_like(a)
    multiply_them(drv.Out(product), shifted_a, b_gpu, block=(n - 1, 1, 1))
    # Only the first n-1 outputs were written by the shortened launch.
    assert la.norm(product[:-1] - a[1:]*b[:-1]) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:30,代码来源:test_driver.py
示例5: train_gpu
def train_gpu(self, num_iter, model_file_path):
# Run the "train_sg" kernel for `num_iter` iterations on the GPU, copy the
# sentence representations back to the host and dump them to
# `model_file_path` via pickle_dump.
# NOTE(review): the indentation of this snippet was lost, so the exact extent
# of the `if` and `for` bodies below cannot be read off the text -- confirm
# against the upstream paragraph_vector.py before re-indenting.
if self.batch == 0:
# Prepare to send the numpy array to gpu
self.syn1_gpu = cuda.to_device(self.syn1)
# Create word idx and related data-structure.
# One WordRep.memsize-byte slot is reserved per dictionary entry.
self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
word_rep_ptr = int(self.base_word_rep)
self.word_reps = {}
for w_idx, word in sorted(self.dictionary.items()):
# 1-2*x maps 0 -> 1 and 1 -> -1; presumably the codes are 0/1 -- TODO confirm.
word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
word_point = self.words_rep[word][1].astype(dtype=np.int32)
# Each WordRep is given the address of its own slot, then the address advances.
self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
word_rep_ptr += WordRep.memsize
print "GPU transfers done."
self.sent_reps_gpu = cuda.to_device(self.sent_reps)
# Prepare sentences for GPU transfer.
# Words missing from the dictionary are dropped from each sentence.
idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
for sentence in self.sentences]
# Prepare the kernel function
kernel = self.kernel_str.get_function("train_sg")
words = np.empty(self.num_sents, dtype=np.int32)
# sent_reps = np.copy(self.sent_reps)
# NOTE(review): `iter` shadows the builtin of the same name and is unused.
for iter in range(num_iter):
# Sample words for each sentence and transfer to GPU
for s_idx in range(self.num_sents):
words[s_idx] = random.choice(idx_sentences[s_idx])
words_gpu = cuda.to_device(words)
# Launch: one block per sentence, self.size threads per block.
kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
# autoinit.context.synchronize()
# Download the sentence representations with their original shape and dtype.
self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
pickle_dump(self.sent_reps, model_file_path)
开发者ID:ustbliubo2014,项目名称:DeepLearn,代码行数:35,代码来源:paragraph_vector.py
示例6: get_phir_gpu
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """Launch the "get_phir" CUDA kernel and return its two outputs.

    Parameters
    ----------
    XK, XV : indexable sequences with one value per surface element
        (len(XK) sets the element count).
    surface : object providing Area, normal, sortSource, xk, wk and the
        *Dev device arrays handed to the kernel.
    field : object providing xq and the xq_gpu/yq_gpu/zq_gpu device arrays.
    par_reac : parameter object providing REAL, K, BSZ and threshold.
    kernel : compiled module exposing the "get_phir" function.

    Returns
    -------
    (phir_cpu, AI_int) : the Nq-long result array downloaded from the
        device, and the sum of the kernel's per-point int32 counters.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    K = par_reac.K
    w = getWeights(K)

    # Expand every per-element value to K entries, one per weight in w.
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        # divmod uses floor division, so both indices stay integers on
        # Python 3 as well (plain `/` would give a float index there).
        elem, node = divmod(i, K)
        X_V[i] = XV[elem]*w[node]*surface.Area[elem]
        X_Kx[i] = XK[elem]*w[node]*surface.Area[elem]*surface.normal[elem,0]
        X_Ky[i] = XK[elem]*w[node]*surface.Area[elem]*surface.normal[elem,1]
        X_Kz[i] = XK[elem]*w[node]*surface.Area[elem]*surface.normal[elem,2]
        X_Kc[i] = XK[elem]
        X_Vc[i] = XV[elem]

    # Host arrays are reordered with surface.sortSource and cast to REAL
    # before upload.
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Enough blocks of BSZ threads to cover the Nq points.
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
开发者ID:cdcooper84,项目名称:pygbe,代码行数:59,代码来源:projection.py
示例7: __init__
def __init__(self, code, point, struct_ptr):
    """Upload `code` and `point` and fill in a device record at `struct_ptr`.

    The record written at `struct_ptr` holds the element count of `code`
    as an int32 at offset 0, the device address of the uploaded `code` at
    offset 8, and the device address of the uploaded `point` one
    pointer-size later.
    """
    self.code = cuda.to_device(code)
    self.point = cuda.to_device(point)
    self.code_shape, self.code_dtype = code.shape, code.dtype
    self.point_shape, self.point_dtype = point.shape, point.dtype

    base = int(struct_ptr)
    # The first pointer sits at byte 8 although the count is only 4 bytes
    # wide; presumably this mirrors the device struct's padding -- confirm.
    code_ptr_at = base + 8
    point_ptr_at = code_ptr_at + np.intp(0).nbytes

    cuda.memcpy_htod(base, np.int32(code.size))
    cuda.memcpy_htod(code_ptr_at, np.intp(int(self.code)))
    cuda.memcpy_htod(point_ptr_at, np.intp(int(self.point)))
开发者ID:Huskyeder,项目名称:ParagraphVec,代码行数:8,代码来源:paragraph_vector.py
示例8: sync_to_device
def sync_to_device(self):
    """Upload the container's contents and return the device handle.

    Three device buffers are created: the stacked ``as_array()`` output of
    every entry in ``self.object_list``, a one-element int32 array holding
    ``self.object_count``, and a two-element pointer-sized array built from
    the first two allocations.  The last one is stored as
    ``self.device_ptr`` and returned.
    """
    host_rows = [f.as_array() for f in self.object_list]
    self.object_array = np.array(host_rows)
    self.d_object_array = cuda.to_device(self.object_array)

    count = np.array([self.object_count], dtype=np.int32)
    self.d_object_count = cuda.to_device(count)

    pointer_pair = np.array([self.d_object_array, self.d_object_count],
                            dtype=np.intp)
    self.device_ptr = cuda.to_device(pointer_pair)
    return self.device_ptr
开发者ID:cfobel,项目名称:pycuda_helpers,代码行数:10,代码来源:struct_container.py
示例9: nlargest
def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    The search runs as one or more kernel passes, each handling at most
    self.maxHeapFloats values; the device buffers are also kept on self as
    self.thresholds and self.thresholdCounts.

    @param n: number of outputs which should be above the threshold
    @type n: int
    @return: array of thresholds (float32, one per individual, in order of
      individuals) which delimit the top n output values; also stored as
      self.thresholdsMat
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    # Split the request into passes of at most maxHeapFloats each.
    passSizes = []
    while n > 0:
        nextSize = min(self.maxHeapFloats, n)
        passSizes.append(nextSize)
        n -= nextSize

    log.debug("pass sizes: %r", passSizes)

    # Thresholds start at +inf and counts at zero before the first pass.
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
        log.debug("begin pass size %d", passSize)
        self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                          self.outputs,
                                          self.trainSet.size,
                                          self.popSize,
                                          passSize,
                                          self.thresholds,
                                          self.thresholdCounts)

        driver.Context.synchronize()

        # The device-to-host copies below exist only for logging, so they
        # are skipped unless debug output is enabled.
        if log.isEnabledFor(logging.DEBUG):
            thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
            log.debug("thresholds: %s", str(thresholdsMat))

            thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
            log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
开发者ID:cpatulea,项目名称:evolution,代码行数:53,代码来源:ann.py
示例10: P2PKt_gpu
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu,
              surf, LorY, w, param, timing, kernel):
    """Launch the "P2PKt" kernel and return the three output device arrays.

    `m` and `mKtc` are cast to param.REAL and uploaded, the kernel is run
    on Ktx_gpu/Kty_gpu/Kty_gpu's sibling buffers passed in by the caller,
    and the per-target int32 counters it fills are summed into
    timing.AI_int.  Elapsed event times (scaled by 1e-3) are accumulated
    into timing.time_trans for the transfers and timing.time_P2P for the
    kernel.  `w` is accepted but not used here.
    """
    # CUDA events when running on the GPU, the project's own Event otherwise.
    make_event = cuda.Event if param.GPU==1 else Event
    tic = make_event()
    toc = make_event()

    # --- host -> device transfer ---
    tic.record()
    REAL = param.REAL
    m_dev = cuda.to_device(m.astype(REAL))
    mKtc_dev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    # --- kernel launch ---
    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    first_offset = surf*len(surfTar.offsetTwigs[surf]) # start of this surface in the offset arrays
    first_list = surf*len(surfTar.P2P_list[surf])      # start of this surface in the list arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                 surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                 surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, m_dev, mKtc_dev,
                 surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev,
                 surfSrc.vertexDev, numpy.int32(first_offset), numpy.int32(first_list),
                 numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                 numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int,
                 surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))
    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    # --- device -> host transfer of the counters ---
    tic.record()
    counters = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(counters[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:52,代码来源:FMMutils.py
示例11: K
def K(self, Q, P, angles, quadratures):
    """Evaluate the kernel on the GPU and return the result divided by self.L.

    cos(angles) and sin(angles) are written to the module globals
    "cos_phi" and "sin_phi"; self.K_gpu then fills an intermediate device
    buffer that self.reduction_gpu collapses into one float32 per row of Q.
    """
    cos_dest = self.mod_K.get_global("cos_phi")[0]
    drv.memcpy_htod(cos_dest, cos(angles).astype(scipy.float32))
    sin_dest = self.mod_K.get_global("sin_phi")[0]
    drv.memcpy_htod(sin_dest, sin(angles).astype(scipy.float32))

    Nx = Q.shape[0]
    # Number of whole 1024-sized groups of quadratures.
    Ny = int(floor(quadratures.size / 1024.))

    result = scipy.empty((Nx,), dtype=scipy.float32)
    # Intermediate buffer: 4 bytes for each of the Ny*Nx partial values.
    partial_gpu = drv.mem_alloc(4*Ny*Nx)
    Q_gpu = drv.to_device(Q)
    P_gpu = drv.to_device(P)

    self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, partial_gpu,
               block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4)
    self.reduction_gpu(partial_gpu, drv.Out(result),
                       block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4)
    return result/self.L
开发者ID:martina88esposito,项目名称:tomohowk,代码行数:13,代码来源:tomography_cuda.py
示例12: __init__
def __init__(self):
    """Create the stream, the page-locked pool and the shared device buffers."""
    self.stream = cuda.Stream()
    self.pool = pycuda.tools.PageLockedMemoryPool()
    self._clear()

    # The buffers below are shared through the slots/ringbuffer mechanism,
    # so any number of launches, genomes and render kernels can use them.
    # Seeds are self-synchronizing, which is why they are not attached to
    # either stream object.
    slot_count = util.DEFAULT_RB_SIZE * 256

    self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
    self.d_seeds = cuda.to_device(mwc.make_seeds(slot_count))

    # 16 bytes of point storage per slot.
    self._len_d_points = slot_count * 16
    self.d_points = cuda.mem_alloc(self._len_d_points)
开发者ID:stevenrobertson,项目名称:cuburn,代码行数:14,代码来源:render.py
示例13: M2P_gpu
def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY, timing, kernel):
    """Launch the "M2P" kernel and return the K_gpu and V_gpu device arrays.

    The M and Md vectors of every tree cell listed in
    surfTar.M2P_list[surf] are packed into two flat host arrays (param.Nm
    entries per cell), cast to param.REAL and uploaded before the launch.
    The elapsed event time (scaled by 1e-3) is added to timing.time_M2P.
    """
    # CUDA events when running on the GPU, the project's own Event otherwise.
    make_event = cuda.Event if param.GPU==1 else Event
    tic = make_event()
    toc = make_event()

    REAL = param.REAL

    tic.record()
    M2P_size = surfTar.offsetMlt[surf,len(surfTar.twig)]
    MSort = numpy.zeros(param.Nm*M2P_size)
    MdSort = numpy.zeros(param.Nm*M2P_size)

    # Cell number `slot` in the list occupies entries [slot*Nm, slot*Nm+Nm).
    for slot, C in enumerate(surfTar.M2P_list[surf,0:M2P_size]):
        lo = slot*param.Nm
        hi = slot*param.Nm+param.Nm
        MSort[lo:hi] = surfSrc.tree[C].M
        MdSort[lo:hi] = surfSrc.tree[C].Md

    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))

    # GPU arrays are flattened, need to point to first element
    first_offset = surf*len(surfTar.offsetTwigs[surf]) # start of this surface in the offset arrays
    first_list = surf*len(surfTar.P2P_list[surf])      # start of this surface in the list arrays

    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,
                  surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
                  MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                  ind0.indexDev, numpy.int32(first_offset), numpy.int32(first_list), REAL(param.kappa),
                  numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), numpy.int32(LorY),
                  block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
开发者ID:cdcooper84,项目名称:pygbe,代码行数:48,代码来源:FMMutils.py
示例14: batch_indexing
def batch_indexing(self, planes, data_points):
    """Run self.indexing_kernel over `data_points` and return the projections.

    Parameters: `planes` is an iterable of buffers, each uploaded to the
    device separately; `data_points` is an array whose first dimension is
    divided by 128 to get the number of data points.

    Returns a uint64 numpy array with one entry per data point, filled in
    by the kernel.
    """
    # Floor division: the count sizes a numpy array below, so it must stay
    # an integer on Python 3 as well.
    data_size = data_points.shape[0] // 128

    self.benchmark_begin('preparing')

    # Upload each plane on its own and collect the device addresses.  The
    # allocation objects are kept in a list; presumably this keeps them
    # from being freed before the kernel has run.
    gpu_alloc_objs = []
    planes_addresses = []
    for plane in planes:
        plane_addr = drv.to_device(plane)
        gpu_alloc_objs.append(plane_addr)
        planes_addresses.append(int(plane_addr))

    planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

    # 64 bit addressing space. each plane address costs 8 bytes
    planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
    drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

    # projections
    projections = numpy.zeros(data_size).astype(numpy.uint64)
    length = numpy.array([data_size]).astype(numpy.uint64)

    # Single-argument print() emits the same text on Python 2 and 3.
    print("total: " + str(data_size) + " data points to indexing.")

    self.benchmark_end('preparing')

    self.benchmark_begin('cudaing')
    self.indexing_kernel(
        planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length),
        block = self.block, grid = self.grid)
    self.benchmark_end('cudaing')

    return projections
开发者ID:viirya,项目名称:fastdict,代码行数:60,代码来源:cuda_indexing.py
示例15: index_list_backend
def index_list_backend(self, ilists):
    """Pack `ilists` into one flat device array with the narrowest index type.

    All lists must have the same length, equal to self.plan.dofs_per_face.
    Entries are stored as uint8 unless some index is 256 or larger, in
    which case uint16 is used.  Returns a GPUIndexLists record carrying
    the chosen type, its C typedef, the device allocation and its size in
    bytes.
    """
    from pytools import single_valued
    ilist_length = single_valued(len(il) for il in ilists)
    assert ilist_length == self.plan.dofs_per_face

    from cgen import Typedef, POD
    from pytools import flatten

    flat_ilists_uncast = numpy.array(list(flatten(ilists)))
    tp = numpy.uint16 if numpy.max(flat_ilists_uncast) >= 256 else numpy.uint8

    flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
    # Guard against the narrowing cast having changed any value.
    assert (flat_ilists == flat_ilists_uncast).all()

    typedef = Typedef(POD(tp, "index_list_entry_t"))
    return GPUIndexLists(
        type=tp,
        code=[typedef],
        device_memory=cuda.to_device(flat_ilists),
        bytes=flat_ilists.size * flat_ilists.itemsize,
    )
开发者ID:gimac,项目名称:hedge,代码行数:26,代码来源:fluxgather.py
示例16: test_multichannel_linear_texture
def test_multichannel_linear_texture(self):
    """Read a 4-channel float array back through a linear 1-D texture.

    A random (16, 16, 4) float32 array is uploaded with to_device(), bound
    to the float4 texture reference, and copied out by the kernel; the
    copy must match the input exactly.
    """
    mod = SourceModule("""
#define CHANNELS 4
texture<float4, 1, cudaReadModeElementType> mtx_tex;
__global__ void copy_texture(float *dest)
{
int i = threadIdx.x+blockDim.x*threadIdx.y;
float4 texval = tex1Dfetch(mtx_tex, i);
dest[i*CHANNELS + 0] = texval.x;
dest[i*CHANNELS + 1] = texval.y;
dest[i*CHANNELS + 2] = texval.z;
dest[i*CHANNELS + 3] = texval.w;
}
""")
    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")

    shape = (16, 16)
    channels = 4
    full_shape = shape + (channels,)

    a = np.random.randn(*full_shape).astype(np.float32)
    a_gpu = drv.to_device(a)

    # Bind the raw allocation to the texture as 4-channel float data.
    mtx_tex.set_address(a_gpu, a.nbytes)
    mtx_tex.set_format(drv.array_format.FLOAT, 4)

    dest = np.zeros(full_shape, dtype=np.float32)
    copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])

    assert la.norm(dest - a) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:34,代码来源:test_driver.py
示例17: cls_init
def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n):
    """
    Prepare the per-kernel state for one binary classifier (class 1 vs 2)
    and copy its data to the device, stored under slot kernel_nr.

    Parameters
    ------------
    kernel_nr : int
        concurrent kernel number (index into the per-kernel containers)
    y_cls : array-like
        binary class labels (1,-1)
    cls1 : int
        first class number
    cls2 : int
        second class number
    cls1_n : int
        number of elements of class 1
    cls2_n : int
        number of elements of class 2

    Besides the uploaded arrays, a zeroed float32 result buffer of size
    2*len(y_cls) is created on the host and copied to the device.
    """
    warp=32

    def round_up_to_warp(count):
        # Smallest multiple of `warp` that is >= count.
        return count+(warp-count%warp)%warp

    align_cls1_n = round_up_to_warp(cls1_n)
    align_cls2_n = round_up_to_warp(cls2_n)
    self.cls1_N_aligned=align_cls1_n

    sum_cls = align_cls1_n+align_cls2_n
    self.sum_cls[kernel_nr] = sum_cls

    # Host copies of the class sizes and class numbers ...
    counts = np.array([cls1_n,cls2_n],dtype=np.int32)
    numbers = np.array([cls1,cls2],dtype=np.int32)
    self.cls_count[kernel_nr] = counts
    self.cls[kernel_nr] = numbers

    # ... and their device counterparts.
    self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
    self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])

    # Blocks needed for threadsPerRow threads per aligned element.
    self.bpg[kernel_nr] = int(np.ceil((self.threadsPerRow*sum_cls+0.0)/self.tpb))

    self.g_y[kernel_nr] = cuda.to_device(y_cls)

    self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32)
    self.g_out[kernel_nr] = cuda.to_device(self.kernel_out[kernel_nr])
开发者ID:ksopyla,项目名称:pyKMLib,代码行数:47,代码来源:GPUKernels.py
示例18: go_sort_old
def go_sort_old(count, stream=None):
    """Benchmark driver: sort `count` random bytes with the GPU kernels.

    The kernels are started through the module's `launch` helper on
    `stream`.  When no stream is given, the device result is downloaded,
    run-length summaries of the device and numpy results are written to
    'dev.txt' and 'cpu.txt', and whether the two results agree is printed.
    """
    # frombuffer(...).copy() yields the same writable uint8 array the
    # deprecated binary-mode np.fromstring produced.
    data = np.frombuffer(np.random.bytes(count), dtype=np.uint8).copy()
    ddata = cuda.to_device(data)
    print('Done seeding')

    # Floor division keeps the grid dimensions integral on Python 3 too.
    grids = count // 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids // 32, 1), stream=stream)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           dshortseg_pfxs, dshortseg_sums,
           block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
           block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
           block=(32, 32, 1), grid=(grids, 1), stream=stream)

    if not stream:
        def format_runs(runs):
            # One tab-indented line per 3-tuple produced by rle().
            return ''.join(['\n\t%3d %4d %4d' % v for v in runs])

        # Named dev_sorted so the builtin sorted() is not shadowed.
        dev_sorted = cuda.from_device(dsorted, (count,), np.int32)
        with open('dev.txt', 'w') as fp:
            fp.write(format_runs(rle(dev_sorted)))

        sorted_np = np.sort(data)
        with open('cpu.txt', 'w') as fp:
            fp.write(format_runs(rle(sorted_np)))

        print('is_sorted? %s' % np.all(dev_sorted == sorted_np))
开发者ID:gijzelaerr,项目名称:cuburn,代码行数:47,代码来源:sortbench.py
示例19: make_superblocks
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    """Pack per-block data into equally sized superblocks on the device.

    single_item = [([ block1, block2, ... ], decl), ...]
    multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    For every block index the single-item blocks are concatenated with the
    joined and padded multi-item blocks; the superblocks are then padded
    to an aligned common size, joined and uploaded.  Returns a record with
    the C struct declaration, the device allocation, the per-superblock
    byte size, the host data and whatever `extra_fields` contains.
    """
    from hedge.backends.cuda.tools import pad_and_join

    # Join each multi-item block's items into one string per block.
    multi_blocks = [
        ["".join(items) for items in part_data]
        for part_data, _ in multi_item]
    # Every part is padded to the size of its largest block.
    block_sizes = [
        max(len(block) for block in part_blocks)
        for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
        len(part_blocks) for part_blocks, _ in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # All blocks of one single-item part must have the same length.
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)
    struct_members.extend(
        ArrayOf(part_decl, max(len(s) for s in part_data))
        for part_data, part_decl in multi_item)

    superblocks = []
    for sb_index in range(block_count):
        pieces = [part_data[sb_index] for part_data, _ in single_item]
        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            pieces.append(pad(part_blocks[sb_index], part_size))
        superblocks.append("".join(pieces))

    superblock_size = devdata.align(
        single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
        struct=Struct(struct_name, struct_members),
        device_memory=cuda.to_device(data),
        block_bytes=superblock_size,
        data=data,
        **extra_fields
    )
开发者ID:paulcazeaux,项目名称:hedge,代码行数:57,代码来源:tools.py
示例20: set_refsmiles
def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None):
    """Install the reference SMILES set on the GPU.

    *refsmilesmat* is the Lingo matrix, *refcountsmat* the count matrix and
    *reflengths* the length vector.  When *refmags* is given it is copied
    to the device as the magnitude vector; otherwise the magnitudes are
    computed on the GPU.

    Because of hardware limitations, the reference matrices must have no
    more than 32,768 rows (molecules) and 65,536 columns (Lingos) after
    padding; larger computations must be performed in tiles.  A ValueError
    is raised when the manually handled path sees a larger matrix.
    """
    # Set up lingo and count matrices on device
    if self.usePycudaArray:
        # Let PyCUDA build CUDA arrays and bind them to the textures.
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manual path: pad, copy asynchronously, bind as 2-D linear memory.
        padded_smiles = self._padded_array(refsmilesmat)
        if padded_smiles.shape[1] > 65536 or padded_smiles.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%padded_smiles.shape)
        self.gpu.rsmiles = cuda.mem_alloc(padded_smiles.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles,padded_smiles,stream=self.gpu.stream)

        padded_counts = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(padded_counts.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts,padded_counts,stream=self.gpu.stream)

        # One descriptor, sized from the padded count matrix, serves both
        # texture bindings.
        descriptor = cuda.ArrayDescriptor()
        descriptor.width = padded_counts.shape[1]
        descriptor.height = padded_counts.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,padded_smiles.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,padded_counts.strides[0])

        # The host temporaries are released only after the stream has
        # finished the asynchronous copies.
        self.gpu.stream.synchronize()
        del padded_smiles
        del padded_counts

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Magnitude buffer, sized like the length vector.
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is None:
        # No magnitudes supplied: compute them on the GPU.
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
    else:
        cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
    return
开发者ID:ihaque,项目名称:SIML,代码行数:56,代码来源:GPULingo.py
注：本文中的pycuda.driver.to_device函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论