本文整理汇总了Python中pycuda.driver.memcpy_dtoh函数的典型用法代码示例。如果您正苦于以下问题：Python memcpy_dtoh函数的具体用法？Python memcpy_dtoh怎么用？Python memcpy_dtoh使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了memcpy_dtoh函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: calcV1complex
def calcV1complex(self, stim, speed):
    """Run the V1 complex-cell pipeline on one frame and return the result.

    The stages are executed strictly in this order: stimulus upload, linear
    filtering, rectification, blur (spatial pooling), normalization and
    direction steering.  Afterwards the content of ``self.d_respV1c`` is
    copied back to the host.

    Args:
        stim: stimulus frame, forwarded unchanged to ``self._loadInput``.
        speed: forwarded unchanged to ``self._calcV1direction``.

    Returns:
        A flat ``float32`` array with ``nrX * nrY * nrDirs`` entries.
    """
    # upload the stimulus to the device
    self._loadInput(stim)

    # processing stages; each one works on the output of the previous one
    self._calcV1linear()            # convolution with the separate V1 filters
    self._calcV1rect()              # rectification -> simple-cell firing rate
    self._calcV1blur()              # spatial pooling -> complex cells
    self._calcV1normalize()         # divisive normalization
    self._calcV1direction(speed)    # steer filters in the requested directions

    # download the responses from the device
    n_responses = self.nrX * self.nrY * self.nrDirs
    host_resp = np.zeros(n_responses, dtype=np.float32)
    cuda.memcpy_dtoh(host_resp, self.d_respV1c)
    return host_resp
开发者ID:UCI-CARL,项目名称:MotionEnergy,代码行数:26,代码来源:motionenergy.py
示例2: scenario_inplace_padded_C2R
def scenario_inplace_padded_C2R(batch,tic,toc):
n = array([2*BENG_CHANNELS_],int32)
inembed = array([16*(BENG_CHANNELS//16+1)],int32)
onembed = array([2*inembed[0]],int32)
plan = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1, inembed[0],
onembed.ctypes.data, 1, onembed[0],
cufft.CUFFT_C2R, batch)
data_shape = (batch,inembed[0])
cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
cpu_data = cpu_data.astype(complex64)
gpu_data = cuda.mem_alloc(8*batch*inembed[0]) # complex64
cuda.memcpy_htod(gpu_data,cpu_data)
tic.record()
cufft.cufftExecC2R(plan,int(gpu_data),int(gpu_data))
toc.record()
toc.synchronize()
cpu_result = np.empty(batch*onembed[0],dtype=np.float32)
cuda.memcpy_dtoh(cpu_result,gpu_data)
cpu_result = cpu_result.reshape((batch,onembed[0]))[:,:2*BENG_CHANNELS_]/(2*BENG_CHANNELS_)
result = irfft(cpu_data[:,:BENG_CHANNELS],axis=-1)
print 'Batched in-place scenario'
print 'test passed:',np.allclose(cpu_result,result)
print 'GPU time:', tic.time_till(toc),' ms = ',tic.time_till(toc)/(batch*0.5*13.128e-3),' x real (both SB)'
开发者ID:sma-wideband,项目名称:sdbe,代码行数:27,代码来源:fft_test.py
示例3: runTest
def runTest(self):
    """Check that an IncidentDirect source writes the expected field values.

    ``self.args`` is unpacked as ``(nx, ny, nz, str_f, pt0, pt1, is_array)``.
    A random source value (array or scalar, depending on ``is_array``) is
    attached to the region between ``pt0`` and ``pt1``; after one E and one H
    update the device buffer for ``str_f`` is downloaded and compared with
    the value computed on the host.

    ``fields.context_pop()`` is called in a ``finally`` clause so that it
    also runs when the assertion (or any earlier step) fails.
    """
    nx, ny, nz, str_f, pt0, pt1, is_array = self.args
    slice_xyz = common.slices_two_points(pt0, pt1)

    # generate random source
    if is_array:
        shape = common.shape_two_points(pt0, pt1)
        value = np.random.rand(*shape).astype(np.float32)
    else:
        value = np.random.ranf()

    # instance
    fields = Fields(0, nx, ny, nz, '', 'single')
    try:
        tfunc = lambda tstep: np.sin(0.03*tstep)
        # The object is not used directly below; it is constructed for its
        # effect on ``fields`` and kept referenced for the whole test.
        incident = IncidentDirect(fields, str_f, pt0, pt1, tfunc, value)

        # host allocations
        eh = np.zeros(fields.ns_pitch, dtype=fields.dtype)

        # expected values after the first time step
        eh[slice_xyz] = fields.dtype(value) * fields.dtype(tfunc(1))

        # run one update on the device and download the result
        fields.update_e()
        fields.update_h()
        copy_eh_buf = fields.get_buf(str_f)
        copy_eh = np.zeros_like(eh)
        cuda.memcpy_dtoh(copy_eh, copy_eh_buf)

        # verify
        original = eh[slice_xyz]
        copy = copy_eh[slice_xyz]
        norm = np.linalg.norm(original - copy)
        self.assertEqual(norm, 0, '%s, %g' % (self.args, norm))
    finally:
        fields.context_pop()
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:35,代码来源:test_incident_direct.py
示例4: calculate
def calculate(self, data, f_high, f_bins):
    """Run the ``culsp_kernel`` CUDA kernel over ``data`` and return the result.

    The input is copied to the GPU, the kernel compiled from
    ``self._kernelStr`` is launched on it with a fixed ``(4, 4, 1)`` block,
    and the device buffer is copied back into a new array.

    Args:
        data: array-like with ``size`` and ``dtype``; uploaded as is.
        f_high: not used by this implementation.
        f_bins: not used by this implementation.

    Returns:
        A new array shaped like ``data`` holding the device buffer content.
    """
    # pycuda is imported lazily; importing pycuda.autoinit sets up the device
    import pycuda.driver as driver
    import pycuda.compiler as compiler
    import pycuda.autoinit

    log = logging.getLogger("astroplpython.function.signal")
    log.debug("CULSP.calculate() called")
    log.debug("Orig Data:"+str(data))
    log.debug(" TODO: Calculate blocksize")

    log.debug("set up GPU, allocate memory for working")
    n_bytes = data.size * data.dtype.itemsize
    a_gpu = driver.mem_alloc(n_bytes)

    log.debug("push data into GPU memory")
    driver.memcpy_htod(a_gpu, data)

    log.debug("compile and run the culsp_kernel on data in the GPU")
    module = compiler.SourceModule(self._kernelStr)
    culsp_func = module.get_function("culsp_kernel")
    culsp_func(a_gpu, block=(4,4,1))

    log.debug("pull data from GPU back into main memory")
    result = np.empty_like(data)
    driver.memcpy_dtoh(result, a_gpu)

    log.debug("return result")
    return result
开发者ID:brianthomas,项目名称:astroplpython,代码行数:29,代码来源:LSPeriodogram.py
示例5: poisson_parallel
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Apply the Poisson blending kernel ``n`` times and return the image.

    Args:
        source_im: source image array, uploaded to the device.
        dest_im: destination image array; its shape provides WIDTH/HEIGHT.
        b_size: CUDA block size; its first two entries are also passed to
            the kernel template as BLOCK_DIM_X / BLOCK_DIM_Y.
        g_size: CUDA grid size.
        RGB: value for the template's RGB variable.
        neighbors: value for the template's NEIGHBORS variable.
        interior_buffer: array uploaded as the third kernel argument.
        n: number of kernel launches.

    Returns:
        A ``uint8`` array shaped like ``dest_im`` holding the destination
        buffer downloaded after the last launch.
    """
    height, width = dest_im.shape[0], dest_im.shape[1]

    # fill in the Cheetah template for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = width
    template.HEIGHT = height
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # host output buffer
    out_image = np.array(dest_im, dtype =np.uint8)

    # device buffers, filled from the corresponding host arrays
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # run the blending kernel n times
    for _ in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # download the final destination image
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
开发者ID:JMTing,项目名称:cs205,代码行数:27,代码来源:parallel_poisson.py
示例6: diffuse_pycuda
def diffuse_pycuda(u):
    """Run the diffusion kernels on ``u`` and return the final field.

    The first row and first column of ``u`` are set to 200 (this modifies the
    caller's array in place, as before), the field is converted to
    ``float32`` and uploaded twice (current and previous state), and the
    module-level ``copy_array`` and ``update`` kernels are launched
    ``nt + 1`` times.

    Fix: the number of blocks is now computed with an integer ceiling
    division.  The previous ``np.ceil(nx/BLOCKSIZE)`` performs a floor
    division on integers under Python 2, so the grid was too small whenever
    ``nx`` was not a multiple of the block size.

    Args:
        u: 2-D array holding the initial field.

    Returns:
        A new ``float32`` array holding the device field after the last step.
    """
    nx, _ = np.int32(u.shape)      # unpacking also requires u to be 2-D
    alpha = np.float32(0.645)
    dx = np.float32(3.5/(nx-1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time/dt))

    # boundary values (written into the caller's array, then copied)
    u[0,:]=200
    u[:,0]=200
    u = u.astype(np.float32)
    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.size*u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size*u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # ceil(nx / BLOCKSIZE) without relying on true division
    n_blocks = int((nx + BLOCKSIZE - 1) // BLOCKSIZE)
    # NOTE(review): both grid dimensions are derived from nx, and the kernels
    # only receive nx, so a square field is presumably assumed -- confirm.
    gridSize = (n_blocks, n_blocks, 1)
    blockSize = (BLOCKSIZE,BLOCKSIZE,1)

    for _ in range(nt+1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)
    return u
开发者ID:htapia,项目名称:lania.pd,代码行数:34,代码来源:diffuse.py
示例7: fromGPU
def fromGPU(self, shared_mem, buff_dtype=np.float32 ):
    """Copy ``self.gpu_data`` from the device into a shared-memory buffer.

    Args:
        shared_mem: object whose ``get_obj()`` returns a buffer that numpy
            can wrap without copying.
        buff_dtype: element type used to interpret the buffer.

    Returns:
        A ``(buffer_nnets, buffer_nsamples)`` array that views the leading
        part of the shared buffer and now holds the device data.
    """
    n_nets = self.buffer_nnets
    n_samples = self.buffer_nsamples

    # wrap the shared buffer, keep only the part that is needed, make it 2-D
    flat = np.frombuffer(shared_mem.get_obj(), dtype=buff_dtype)
    target = flat[:n_nets*n_samples].reshape( (n_nets, n_samples) )

    cuda.memcpy_dtoh(target, self.gpu_data)
    return target
开发者ID:JohnCEarls,项目名称:GPUDirac,代码行数:7,代码来源:data.py
示例8: test_constant_memory
def test_constant_memory(self):
    """Round-trip 32 floats through CUDA ``__constant__`` memory.

    The host array is copied into the module's ``const_array`` symbol, a
    32-thread kernel copies it into a global buffer, and that buffer is
    downloaded and compared with the original values.

    Fix: the final check used ``.all`` without calling it, which asserts on a
    bound method (always truthy) and therefore could never fail.
    """
    # contributed by Andrew Wagner
    module = SourceModule("""
__constant__ float const_array[32];
__global__ void copy_constant_into_global(float* global_result_array)
{
global_result_array[threadIdx.x] = const_array[threadIdx.x];
}
""")
    copy_constant_into_global = module.get_function("copy_constant_into_global")
    # get_global returns a pair; only the first element (device address) is used
    const_array, _ = module.get_global('const_array')

    host_array = np.random.randint(0,255,(32,)).astype(np.float32)
    global_result_array = drv.mem_alloc_like(host_array)
    drv.memcpy_htod(const_array, host_array)

    copy_constant_into_global(
        global_result_array,
        grid=(1, 1), block=(32, 1, 1))

    host_result_array = np.zeros_like(host_array)
    drv.memcpy_dtoh(host_result_array, global_result_array)

    assert (host_result_array == host_array).all()
开发者ID:davidweichiang,项目名称:pycuda,代码行数:28,代码来源:test_driver.py
示例9: calc_bandwidth_d2h
def calc_bandwidth_d2h( s ):
    """Measure the device-to-host copy rate for ``s``.

    Copies ``s.dev_a`` into ``s.a`` once and times the call with the wall
    clock.

    Args:
        s: object providing ``a`` (host array), ``dev_a`` (device buffer)
            and ``nbytes``.

    Returns:
        ``s.nbytes`` divided by the elapsed seconds and by the module-level
        ``gbytes`` constant (presumably the number of bytes per gigabyte --
        confirm where it is defined).
    """
    started = datetime.now()
    cuda.memcpy_dtoh( s.a, s.dev_a )
    elapsed = datetime.now() - started

    # only the seconds and microseconds fields of the timedelta are used
    elapsed_s = elapsed.seconds + elapsed.microseconds*1e-6
    return s.nbytes/elapsed_s/gbytes
开发者ID:wbkifun,项目名称:fdtd_accelerate,代码行数:7,代码来源:150-gpus-mpi-range-h5-seperate.py
示例10: fromSourceFile
def fromSourceFile():
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
#random data
np.random.seed(1)
a = np.random.randn(4,4)
a = a.astype(np.float32)
#read code and get function
mod = SourceModule(open('simple.cu').read())
func = mod.get_function("doublify")
#allocate memory on the GPU
a_gpu = cuda.mem_alloc(a.nbytes)
#transfer to the GPU memory
cuda.memcpy_htod(a_gpu, a)
#execute
func(a_gpu, block=(4,4,1))
#collect results
a_doubled = np.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print a_doubled
print a_doubled / (a*2)
开发者ID:eddienko,项目名称:EuclidVisibleInstrument,代码行数:30,代码来源:cudaTests.py
示例11: test_prepared_invocation
def test_prepared_invocation(self):
    """Exercise ``prepare`` / ``prepared_call`` with and without an offset.

    First the ``doublify`` kernel is run over all 16 elements of a 4x4
    ``float32`` array and the result must equal ``2 * a`` exactly.  Then the
    kernel is launched again with 15 threads on a pointer advanced by one
    element, after which rows 1 and onwards must equal ``4 * a``.
    """
    host = np.random.randn(4,4).astype(np.float32)
    elem_bytes = host.dtype.itemsize

    a_gpu = drv.mem_alloc(host.size * elem_bytes)
    drv.memcpy_htod(a_gpu, host)

    kernel_source = """
__global__ void doublify(float *a)
{
int idx = threadIdx.x + threadIdx.y*blockDim.x;
a[idx] *= 2;
}
"""
    mod = SourceModule(kernel_source)
    func = mod.get_function("doublify")

    # first pass: whole array, single pointer argument
    func.prepare("P")
    func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20)
    doubled = np.empty_like(host)
    drv.memcpy_dtoh(doubled, a_gpu)
    print (host)
    print (doubled)
    assert la.norm(doubled-2*host) == 0

    # now with offsets: skip the first element, double the other 15 again
    func.prepare("P")
    quadrupled = np.empty_like(host)
    shifted_ptr = int(a_gpu)+elem_bytes
    func.prepared_call((1, 1), (15,1,1), shifted_ptr)
    drv.memcpy_dtoh(quadrupled, a_gpu)
    assert la.norm(quadrupled[1:]-4*host[1:]) == 0
开发者ID:davidweichiang,项目名称:pycuda,代码行数:29,代码来源:test_driver.py
示例12: loop
def loop(iterations):
    """Advance the simulation by ``iterations`` time steps.

    Every step copies the host array ``F`` into ``T``, uploads ``T``, runs
    the ``prop``, ``density``, ``eq`` and ``bounceback`` kernels in that
    order, and downloads ``F`` again.  All arrays, device buffers, kernels
    and launch dimensions are module-level names.

    Args:
        iterations: the loop runs while the step counter is below this value.
    """
    block_dims = (blockDimX,blockDimY,1)
    grid_dims = (gridDimX,gridDimY)

    step = 0
    while step < iterations:
        # To avoid overwrites a temporary copy is made of F
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)

        # Propagate
        prop(F_gpu, T_gpu, block=block_dims, grid=grid_dims)

        # Calculate density and get bounceback from obstacle nodes
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                block=block_dims, grid=grid_dims)

        # Calculate equilibrium
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu, U_SQU_gpu, U_C2_gpu,
           U_C4_gpu, U_C6_gpu, U_C8_gpu, block=block_dims, grid=grid_dims)

        # Transfer bounceback to obstacle nodes
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu,
                   block=block_dims, grid=grid_dims)

        # Copy F to host for copy to T in beginning of loop
        cuda.memcpy_dtoh(F, F_gpu)

        step += 1
开发者ID:hohiroki,项目名称:Lattice-Boltzmann,代码行数:28,代码来源:lbm2dcu.py
示例13: cuda_crossOver
def cuda_crossOver(sola, solb):
""" """
sol_len = len(sola);
a_gpu = cuda.mem_alloc(sola.nbytes);
b_gpu = cuda.mem_alloc(solb.nbytes);
cuda.memcpy_htod(a_gpu, sola);
cuda.memcpy_htod(b_gpu, solb);
func = mod.get_function("crossOver");
func(a_gpu,b_gpu, block=(sol_len,1,1));
a_new = numpy.empty_like(sola);
b_new = numpy.empty_like(solb);
cuda.memcpy_dtoh(a_new, a_gpu);
cuda.memcpy_dtoh(b_new, b_gpu);
if debug == True:
print "a:", a;
print "b:",b;
print "new a:",a_new;
print "new b:",b_new;
return a_new,b_new;
开发者ID:adamuas,项目名称:coevondm,代码行数:27,代码来源:cudaInterface.py
示例14: _debug_print
def _debug_print( self ) :
cuda_driver.memcpy_dtoh( self.f , self.df1 )
np.set_printoptions( 3 , 10000 , linewidth = 200 , suppress = True )
print '#'*80
print self.f
开发者ID:jkotur,项目名称:particles,代码行数:7,代码来源:lbm.py
示例15: convolution_cuda
def convolution_cuda(sourceImage, filterx, filtery):
    """Perform a separable convolution of ``sourceImage`` using CUDA.

    Operates on floating point images with row-major storage.  The row
    kernel writes into an intermediate device image, the column kernel then
    writes the final result, which is copied back to the host.

    Cleanup: the locals ``BYTES_PER_WORD``, ``DATA_SIZE`` and ``KERNEL_SIZE``
    were computed but never used and have been removed.

    Args:
        sourceImage: 2-D ``float32`` image.
        filterx: 1-D filter of length ``KERNEL_W``, uploaded to
            ``d_Kernel_rows``.
        filtery: 1-D filter of length ``KERNEL_W``, uploaded to
            ``d_Kernel_columns``.

    Returns:
        A new array shaped like ``sourceImage`` holding the filtered image.

    Raises:
        AssertionError: if the image is not ``float32`` or the filters do not
            have shape ``(KERNEL_W,)``.
    """
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) , 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)

    # NOTE(review): the width handed to the kernels is rounded up to a
    # multiple of 16, while the device buffers below are sized like the
    # unpadded image.  Presumably callers pass widths that are already
    # multiples of 16 -- confirm.
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight

    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_rows, filterx)
    cuda.memcpy_htod(d_Kernel_columns, filtery)

    # Launch geometry for the convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)

    # Kernel scalar arguments are passed as 32-bit integers.
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)

    convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H,
                      grid=[int(e) for e in blockGridRows],
                      block=[int(e) for e in threadBlockRows])
    convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=[int(e) for e in blockGridColumns],
                         block=[int(e) for e in threadBlockColumns])

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
开发者ID:eddienko,项目名称:EuclidVisibleInstrument,代码行数:34,代码来源:Convolution.py
示例16: interior_buffer
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    """Run the mask kernel on ``source_im`` and return the resulting buffer.

    Args:
        source_im: source image array; uploaded and processed in place on
            the device.
        dest_im: destination image; provides WIDTH/HEIGHT for the kernel
            template and the shape of the returned buffer.
        b_size: CUDA block size; its first two entries are also passed to
            the template as BLOCK_DIM_X / BLOCK_DIM_Y.
        g_size: CUDA grid size.
        RGB: value for the template's RGB variable.
        neighbors: value for the template's NEIGHBORS variable.

    Returns:
        A ``uint8`` array shaped like ``dest_im`` filled from the device
        buffer after the kernel has run.
    """
    height, width = dest_im.shape[0], dest_im.shape[1]

    # fill in the Cheetah template for the mask kernel
    mask_template = Template(mask_source)
    mask_template.BLOCK_DIM_X = b_size[0]
    mask_template.BLOCK_DIM_Y = b_size[1]
    mask_template.WIDTH = width
    mask_template.HEIGHT = height
    mask_template.RGB = RGB
    mask_template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    mask_kernel = cuda_compile(mask_template, "mask_kernel")

    # upload the source image
    d_source = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(d_source, source_im)

    # filter out the interior points of the mask on the GPU
    mask_kernel(d_source, block=b_size, grid=g_size)

    # download the interior point buffer
    inner_buffer = np.array(dest_im, dtype =np.uint8)
    cu.memcpy_dtoh(inner_buffer, d_source)
    return inner_buffer
开发者ID:JMTing,项目名称:cs205,代码行数:26,代码来源:parallel_poisson.py
示例17: calc_blob_blob_forces_pycuda
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
    """Compute blob-blob forces on the GPU.

    Args:
        r_vectors: blob positions; reshaped to a flat array of
            ``3 * len(r_vectors)`` values before upload.
        *args: ignored.
        **kwargs: read keys are ``periodic_length`` (indexed at 0, 1, 2),
            ``repulsion_strength``, ``debye_length`` and ``blob_radius``.

    Returns:
        Array of shape ``(len(r_vectors), 3)`` holding the forces written by
        the ``calc_blob_blob_force`` kernel.
    """
    # launch configuration
    n_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(n_blobs)

    # parameters
    L = kwargs.get('periodic_length')
    eps = kwargs.get('repulsion_strength')
    b = kwargs.get('debye_length')
    blob_radius = kwargs.get('blob_radius')

    # flat host arrays: positions in, forces out
    x = np.reshape(r_vectors, n_blobs * 3)
    f = np.empty_like(x)

    # device buffers; only the positions need to be uploaded
    x_gpu = cuda.mem_alloc(x.nbytes)
    f_gpu = cuda.mem_alloc(f.nbytes)
    cuda.memcpy_htod(x_gpu, x)

    force = mod.get_function("calc_blob_blob_force")

    # scalar kernel arguments are passed as doubles
    scalar_args = (np.float64(eps), np.float64(b), np.float64(blob_radius),
                   np.float64(L[0]), np.float64(L[1]), np.float64(L[2]))
    force(x_gpu, f_gpu, *(scalar_args + (n_blobs,)),
          block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

    # device to host
    cuda.memcpy_dtoh(f, f_gpu)
    return np.reshape(f, (n_blobs, 3))
开发者ID:stochasticHydroTools,项目名称:RigidMultiblobsWall,代码行数:33,代码来源:forces_pycuda_user_defined.py
示例18: test_compare_order
def test_compare_order():
'''
compare_order between C(row-major), F(column-major)
'''
compare_order = mod_cu.get_function('compare_order')
nx, ny = 3, 4
f_1d = np.arange(nx*ny, dtype='f8')
f_2d_C = f_1d.reshape((nx,ny), order='C')
f_2d_F = f_1d.reshape((nx,ny), order='F')
print ''
print 'f_1d_C\n\n', f_1d
print 'f_2d_C\n', f_2d_C
print 'f_2d_F\n', f_2d_F
print ''
print 'after cuda'
ret_f_1d = np.zeros_like(f_1d)
f_1d_gpu = cuda.mem_alloc_like(f_1d)
f_2d_C_gpu = cuda.to_device(f_2d_C)
compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
print 'f_1d from f_2d_C\n', ret_f_1d
f_2d_F_gpu = cuda.to_device(f_2d_F)
compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
print 'f_1d from f_2d_F\n', ret_f_1d
开发者ID:wbkifun,项目名称:my_stuff,代码行数:31,代码来源:compare_order_C_F.py
示例19: calc_psd
def calc_psd(self,bitloads,xtalk):
    """Run the ``k_calcpsd`` kernel for all ``self.K`` combinations.

    Changes compared with the previous version:
      * device buffer sizes are computed from the element size instead of
        allocating large zero-filled host arrays only to read ``.nbytes``;
      * the device buffers are released in a ``finally`` clause, so they are
        also freed when the kernel launch or a copy raises;
      * the unused local ``memdim`` was removed.

    Args:
        bitloads: bit loading matrix; flattened with ``util.mat2arr`` and
            uploaded as ``int32``.
        xtalk: crosstalk gains; uploaded converted to ``self.type``.

    Returns:
        ``float64`` array of shape ``(self.K, self.N)`` downloaded from the
        kernel's second (``P``) argument.
    """
    #Number of expected permutations
    Ncombinations=self.K

    #Check if this is getting hairy and assign grid/block dimensions
    (warpcount,warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations)
    threadshare_grid=(blockCount,1)
    threadshare_block=(threadCount,1,1)

    #Element sizes in bytes for the floating point type and for int32
    fpt_size=np.dtype(self.type).itemsize
    int_size=np.dtype(np.int32).itemsize

    #Memory (We get away with the NCombinations because calpsd checks against it)
    d_a=cuda.mem_alloc(int(Ncombinations*self.N*self.N*fpt_size))
    d_p=cuda.mem_alloc(int(Ncombinations*self.N*fpt_size))
    d_bitload=cuda.mem_alloc(int(self.K*self.N*int_size))
    d_XTG=cuda.mem_alloc(int(self.K*self.N*self.N*fpt_size))
    try:
        h_p=np.zeros((self.K,self.N)).astype(self.type)
        cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG,xtalk.astype(self.type))

        #Go solve
        #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){
        self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p,d_p)
    finally:
        d_a.free()
        d_bitload.free()
        d_XTG.free()
        d_p.free()
    return h_p.astype(np.float64)
开发者ID:andrewbolster,项目名称:multiuserDSM,代码行数:32,代码来源:gpu.py
示例20: test_pycuda
def test_pycuda(self):
    """Test the pycuda installation with a small example.

    A random 4x4 ``float32`` array is doubled on the GPU by a ``doublify``
    kernel and copied back.

    Changes compared with the previous version:
      * when anything in the pycuda round trip raises, the failure message
        now includes the original exception instead of hiding it;
      * the downloaded array is actually compared with ``2 * a`` (doubling a
        float32 value is exact, so an exact comparison is safe).
    """
    try:
        import pycuda.driver as cuda
        import pycuda.autoinit
        from pycuda.compiler import SourceModule
        import numpy as np

        a = np.random.randn(4, 4)
        print(a)
        a = a.astype(np.float32)

        a_gpu = cuda.mem_alloc(a.nbytes)
        cuda.memcpy_htod(a_gpu, a)

        mod = SourceModule(
            """
__global__ void doublify(float *a)
{
int idx = threadIdx.x + threadIdx.y*4;
a[idx] *= 2;
}
"""
        )
        func = mod.get_function("doublify")
        func(a_gpu, block=(4,4,1))

        a_doubled = np.empty_like(a)
        cuda.memcpy_dtoh(a_doubled, a_gpu)
    except Exception as exc:
        # keep the original failure text, but attach the underlying cause
        self.fail('Still not working: %r' % (exc,))

    self.assertTrue((a_doubled == 2 * a).all(),
                    'doublify kernel returned unexpected values')
开发者ID:SpikingNeurons,项目名称:WriterIdentification,代码行数:33,代码来源:unit_tests.py
注：本文中的pycuda.driver.memcpy_dtoh函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论