This page collects typical usage examples of the C++ divup function. If you have been wondering what divup does, how to call it, or where it shows up in real code, the hand-picked examples below should help.
The following shows 20 code examples of the divup function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
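None of the snippets below include the definition of divup itself. Judging from how it is used throughout (sizing thread grids and block-aligned buffers), it is integer division rounded up: the smallest n with n * b >= a. A minimal sketch, assuming the simple two-argument template form found in projects such as ArrayFire:

template<typename T>
static inline T divup(T a, T b) {
    // Ceiling division: smallest n such that n * b >= a.
    // Assumes a >= 0 and b > 0; for unsigned types note that
    // a + b - 1 can wrap if a is near the type's maximum.
    return (a + b - 1) / b;
}

For example, divup(10, 3) == 4: ten elements are covered by four blocks of three threads, with two lanes idle in the last block.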
Example 1: reduce_first
void reduce_first(Param<To> out, CParam<Ti> in, bool change_nan,
                  double nanval) {
    uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
    threads_x = std::min(threads_x, THREADS_PER_BLOCK);
    uint threads_y = THREADS_PER_BLOCK / threads_x;

    uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
    uint blocks_y = divup(in.dims[1], threads_y);

    Param<To> tmp = out;
    uptr<To> tmp_alloc;
    if (blocks_x > 1) {
        tmp_alloc =
            memAlloc<To>(blocks_x * in.dims[1] * in.dims[2] * in.dims[3]);
        tmp.ptr = tmp_alloc.get();

        tmp.dims[0] = blocks_x;
        for (int k = 1; k < 4; k++) tmp.strides[k] *= blocks_x;
    }

    reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x,
                                      change_nan, nanval);

    if (blocks_x > 1) {
        // FIXME: Is there an alternative to the if condition?
        if (op == af_notzero_t) {
            reduce_first_launcher<To, To, af_add_t>(
                out, tmp, 1, blocks_y, threads_x, change_nan, nanval);
        } else {
            reduce_first_launcher<To, To, op>(out, tmp, 1, blocks_y, threads_x,
                                              change_nan, nanval);
        }
    }
}
Developer ID: 9prady9, Project: arrayfire, Lines: 34, Source: reduce.hpp
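To make Example 1's grid sizing concrete, a hedged walk-through (THREADS_PER_BLOCK and REPEAT are compile-time constants defined elsewhere in ArrayFire; 256 and 32 here are assumed values for illustration): with in.dims[0] = 1000000, threads_x saturates at 256, threads_y = 256 / 256 = 1, and blocks_x = divup(1000000, 256 * 32) = divup(1000000, 8192) = 123. The first launch writes 123 partial results along dimension 0; because blocks_x > 1, a second launch with blocks_x = 1 reduces those 123 partials down to the final value.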
Example 2: morph
void morph(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);
    // launch batch * blk_x blocks along x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int halo     = windLen/2;
    int padding  = 2*halo;
    int shrdLen  = kernel::THREADS_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);

    switch(windLen) {
        case  3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,  3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
Developer ID: munnybearz, Project: arrayfire, Lines: 30, Source: morph.hpp
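A quick check of the shared-memory arithmetic in Example 2, using assumed block dimensions THREADS_X = 16 and THREADS_Y = 16 with windLen = 5: halo = 2, padding = 4, shrdLen = 16 + 4 + 1 = 21, and shrdSize = 21 * (16 + 4) * sizeof(T) = 420 * sizeof(T) bytes. Each block stages its tile plus a windLen/2 halo on every side; the extra +1 column of padding staggers consecutive rows across shared-memory banks.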
Example 3: meanshift
void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint iter)
{
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    const int bCount   = (is_color ? 1 : in.dims[2]);
    const int channels = (is_color ? in.dims[2] : 1); // this has to be 3 for color images

    dim3 blocks(blk_x * bCount, blk_y * in.dims[3]);

    // clamp spatial and chromatic sigmas
    float space_ = std::min(11.5f, s_sigma);
    int radius   = std::max((int)(space_ * 1.5f), 1);
    int padding  = 2*radius+1;
    const float cvar = c_sigma*c_sigma;
    size_t shrd_size = channels*(threads.x + padding)*(threads.y+padding)*sizeof(T);

    if (is_color)
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);
    else
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
Developer ID: victorv, Project: arrayfire, Lines: 28, Source: meanshift.hpp
Example 4: mxr_mplane_fill
static void mxr_mplane_fill(struct v4l2_plane_pix_format *planes,
                            const struct mxr_format *fmt, u32 width, u32 height)
{
    int i;
    int y_size, cb_size;

    memset(planes, 0, sizeof(*planes) * fmt->num_subframes);
    for (i = 0; i < fmt->num_planes; ++i) {
        struct v4l2_plane_pix_format *plane = planes
            + fmt->plane2subframe[i];
        const struct mxr_block *blk = &fmt->plane[i];
        u32 bl_width = divup(width, blk->width);
        u32 bl_height = divup(height, blk->height);
        u32 sizeimage = bl_width * bl_height * blk->size;
        u16 bytesperline = bl_width * blk->size / blk->height;

        if (fmt->fourcc == V4L2_PIX_FMT_NV12MT) {
            y_size = ALIGN(width, 128) * ALIGN(height, 64);
            cb_size = ALIGN(width, 128) * ALIGN(height / 2, 64);
            plane->sizeimage += i ? cb_size : y_size;
        } else {
            plane->sizeimage += sizeimage;
        }
        plane->bytesperline = max(plane->bytesperline, bytesperline);
    }
}
Developer ID: cm-3470, Project: android_kernel_samsung_degaslte, Lines: 26, Source: mixer_video.c
Example 5: mean_dim
void mean_dim(Param out, Param in, Param inWeight, int dim)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {(uint)divup(in.info.dims[0], threads_x),
                         (uint)in.info.dims[1],
                         (uint)in.info.dims[2],
                         (uint)in.info.dims[3]};

    groups_all[dim] = divup(in.info.dims[dim], threads_y * REPEAT);

    if (groups_all[dim] > 1) {
        dim4 d(4, out.info.dims);
        d[dim] = groups_all[dim];
        Array<To> tmpOut = createEmptyArray<To>(d);
        Array<Tw> tmpWeight = createEmptyArray<Tw>(d);
        mean_dim_launcher<Ti, Tw, To>(tmpOut, tmpWeight, in, inWeight, dim, threads_y, groups_all);

        Param owt;
        groups_all[dim] = 1;
        mean_dim_launcher<Ti, Tw, To>(out, owt, tmpOut, tmpWeight, dim, threads_y, groups_all);
    } else {
        Param tmpWeight;
        mean_dim_launcher<Ti, Tw, To>(out, tmpWeight, in, inWeight, dim, threads_y, groups_all);
    }
}
Developer ID: FilipeMaia, Project: arrayfire, Lines: 28, Source: mean.hpp
Example 6: shift
void shift(Param<T> out, CParam<T> in, const int *sdims)
{
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2],
                blocksPerMatY * out.dims[3],
                1);

    const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    int sdims_[4];
    // Need to do this because we are mapping output to input in the kernel
    for(int i = 0; i < 4; i++) {
        // sdims_[i] is always non-negative and in [0, out.dims[i]].
        // Negative shifts are converted to positions by going the other way round.
        sdims_[i] = -(sdims[i] % (int)out.dims[i]) + out.dims[i] * (sdims[i] > 0);
        assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
    }

    CUDA_LAUNCH((shift_kernel<T>), blocks, threads,
                out, in, sdims_[0], sdims_[1], sdims_[2], sdims_[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
Developer ID: munnybearz, Project: arrayfire, Lines: 28, Source: shift.hpp
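The two divup lines involving maxGridSize[1] in Example 6 fold an oversized y grid into the z dimension. With assumed numbers: if blocks.y = 100000 and the device reports maxGridSize[1] = 65535, then blocks.z = divup(100000, 65535) = 2 and blocks.y becomes divup(100000, 2) = 50000, so the 50000 x 2 grid still covers all 100000 row-blocks while each grid dimension respects the device limit.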
Example 7: select
void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b, int ndims)
{
    bool is_same = true;
    for (int i = 0; i < 4; i++) {
        is_same &= (a.dims[i] == b.dims[i]);
    }

    dim3 threads(DIMX, DIMY);

    if (ndims == 1) {
        threads.x *= threads.y;
        threads.y = 1;
    }

    int blk_x = divup(out.dims[0], threads.x);
    int blk_y = divup(out.dims[1], threads.y);

    dim3 blocks(blk_x * out.dims[2],
                blk_y * out.dims[3]);

    if (is_same) {
        CUDA_LAUNCH((select_kernel<T, true>), blocks, threads,
                    out, cond, a, b, blk_x, blk_y);
    } else {
        CUDA_LAUNCH((select_kernel<T, false>), blocks, threads,
                    out, cond, a, b, blk_x, blk_y);
    }
}
Developer ID: Brainiarc7, Project: arrayfire, Lines: 30, Source: select.hpp
Example 8: transpose
void transpose(Param<T> out, CParam<T> in, const bool conjugate,
               const bool is32multiple) {
    static const std::string source(transpose_cuh, transpose_cuh_len);

    // clang-format off
    auto transpose = getKernel("cuda::transpose", source,
            {
                TemplateTypename<T>(),
                TemplateArg(conjugate),
                TemplateArg(is32multiple)
            },
            {
                DefineValue(TILE_DIM),
                DefineValue(THREADS_Y)
            }
            );
    // clang-format on

    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    const int maxBlocksY =
        cuda::getDeviceProp(getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    EnqueueArgs qArgs(blocks, threads, getActiveStream());

    transpose(qArgs, out, in, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
Developer ID: 9prady9, Project: arrayfire, Lines: 34, Source: transpose.hpp
Example 9: scan_dim_by_key
void scan_dim_by_key(Param<To> out, CParam<Ti> in, CParam<Tk> key, int dim, bool inclusive_scan)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(out.dims[dim]));
    uint threads_x = THREADS_X;

    uint blocks_all[] = {divup(out.dims[0], threads_x),
                         out.dims[1], out.dims[2], out.dims[3]};

    blocks_all[dim] = divup(out.dims[dim], threads_y * REPEAT);

    if (blocks_all[dim] == 1) {
        scan_dim_final_launcher<Ti, Tk, To, op>(out, in, key,
                                                dim,
                                                threads_y,
                                                blocks_all,
                                                true, inclusive_scan);
    } else {
        Param<To> tmp = out;
        Param<char> tmpflg;
        Param<int> tmpid;

        tmp.dims[dim] = blocks_all[dim];
        tmp.strides[0] = 1;
        for (int k = 1; k < 4; k++) tmp.strides[k] = tmp.strides[k - 1] * tmp.dims[k - 1];
        for (int k = 0; k < 4; k++) {
            tmpflg.strides[k] = tmp.strides[k];
            tmpid.strides[k] = tmp.strides[k];
            tmpflg.dims[k] = tmp.dims[k];
            tmpid.dims[k] = tmp.dims[k];
        }

        int tmp_elements = tmp.strides[3] * tmp.dims[3];
        tmp.ptr = memAlloc<To>(tmp_elements);
        tmpflg.ptr = memAlloc<char>(tmp_elements);
        tmpid.ptr = memAlloc<int>(tmp_elements);

        scan_dim_nonfinal_launcher<Ti, Tk, To, op>(out, tmp, tmpflg,
                                                   tmpid, in, key,
                                                   dim,
                                                   threads_y,
                                                   blocks_all,
                                                   inclusive_scan);

        int bdim = blocks_all[dim];
        blocks_all[dim] = 1;
        scan_dim_final_launcher<To, char, To, op>(tmp, tmp, tmpflg,
                                                  dim,
                                                  threads_y,
                                                  blocks_all, false, true);
        blocks_all[dim] = bdim;

        bcast_dim_launcher<To, op>(out, tmp, tmpid, dim, threads_y, blocks_all);

        memFree(tmp.ptr);
        memFree(tmpflg.ptr);
        memFree(tmpid.ptr);
    }
}
Developer ID: shehzan10, Project: arrayfire, Lines: 60, Source: scan_dim_by_key_impl.hpp
Example 10: lookup
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
{
    if (nDims==1) {
        const dim3 threads(THREADS, 1);

        /* find which dimension has non-zero # of elements */
        int vDim = 0;
        for (int i=0; i<4; i++) {
            if (in.dims[i]==1)
                vDim++;
            else
                break;
        }

        int blks = divup(out.dims[vDim], THREADS*THRD_LOAD);

        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in, indices, blks_x, blks_y);
    }

    POST_LAUNCH_CHECK();
}
Developer ID: shehzan10, Project: arrayfire, Lines: 31, Source: lookup.hpp
Example 11: transform
void transform(Param<T> out, CParam<T> in, CParam<float> tf,
               const bool inverse)
{
    dim_type nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const dim_type ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                  cudaMemcpyDeviceToDevice));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    const dim_type blocksXPerImage = blocks.x;
    if(nimages > TI) {
        dim_type tile_images = divup(nimages, TI);
        nimages = TI;
        blocks.x = blocks.x * tile_images;
    }

    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if(inverse) {
        transform_kernel<T, true, method><<<blocks, threads>>>
                        (out, in, nimages, ntransforms, blocksXPerImage);
    } else {
        transform_kernel<T, false, method><<<blocks, threads>>>
                        (out, in, nimages, ntransforms, blocksXPerImage);
    }
    POST_LAUNCH_CHECK();
}
Developer ID: EmergentOrder, Project: arrayfire, Lines: 27, Source: transform.hpp
Example 12: transform
void transform(Param<T> out, CParam<T> in, CParam<float> tf,
               const bool inverse)
{
    int nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const int ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    const int blocksXPerImage = blocks.x;
    if(nimages > TI) {
        int tile_images = divup(nimages, TI);
        nimages = TI;
        blocks.x = blocks.x * tile_images;
    }

    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if(inverse) {
        CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    } else {
        CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    }
    POST_LAUNCH_CHECK();
}
Developer ID: hxiaox, Project: arrayfire, Lines: 33, Source: transform.hpp
Example 13: scan_dim
static void scan_dim(Param &out, const Param &in, int dim)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                         (uint)out.info.dims[1],
                         (uint)out.info.dims[2],
                         (uint)out.info.dims[3]};

    groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

    if (groups_all[dim] == 1) {
        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, out, in,
                                                      dim, true,
                                                      threads_y,
                                                      groups_all);
    } else {
        Param tmp = out;

        tmp.info.dims[dim] = groups_all[dim];
        tmp.info.strides[0] = 1;
        for (int k = 1; k < 4; k++) {
            tmp.info.strides[k] = tmp.info.strides[k - 1] * tmp.info.dims[k - 1];
        }

        int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];

        // FIXME: Do I need to free this ?
        tmp.data = bufferAlloc(tmp_elements * sizeof(To));

        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, tmp, in,
                                                      dim, false,
                                                      threads_y,
                                                      groups_all);

        int gdim = groups_all[dim];
        groups_all[dim] = 1;

        if (op == af_notzero_t) {
            scan_dim_launcher<To, To, af_add_t, true>(tmp, tmp, tmp,
                                                      dim, true,
                                                      threads_y,
                                                      groups_all);
        } else {
            scan_dim_launcher<To, To, op, true>(tmp, tmp, tmp,
                                                dim, true,
                                                threads_y,
                                                groups_all);
        }

        groups_all[dim] = gdim;
        bcast_dim_launcher<To, To, op, inclusive_scan>(out, tmp,
                                                       dim, true,
                                                       threads_y,
                                                       groups_all);
        bufferFree(tmp.data);
    }
}
Developer ID: FilipeMaia, Project: arrayfire, Lines: 60, Source: scan_dim.hpp
Example 14: mxr_get_plane_size
unsigned long mxr_get_plane_size(const struct mxr_block *blk,
                                 unsigned int width, unsigned int height)
{
    unsigned int bl_width = divup(width, blk->width);
    unsigned int bl_height = divup(height, blk->height);
    return bl_width * bl_height * blk->size;
}
Developer ID: cm-3470, Project: android_kernel_samsung_degaslte, Lines: 8, Source: mixer_video.c
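A worked instance of Example 14 (the block geometry is assumed for illustration; real values come from the driver's mxr_format tables): for a 1920x1080 frame in a format whose block is 16x16 pixels and 256 bytes, bl_width = divup(1920, 16) = 120 and bl_height = divup(1080, 16) = 68, giving 120 * 68 * 256 = 2088960 bytes. divup rounds 1080 / 16 = 67.5 up to 68 block rows, so the partially covered bottom row of blocks is still allocated.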
Example 15: select_launcher
void select_launcher(Param out, Param cond, Param a, Param b, int ndims)
{
    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static std::map<int, Program*> selProgs;
    static std::map<int, Kernel*> selKernels;

    int device = getActiveDeviceId();

    std::call_once(compileFlags[device], [device] () {
        std::ostringstream options;
        options << " -D is_same=" << is_same
                << " -D T=" << dtype_traits<T>::getName();

        if (std::is_same<T, double>::value ||
            std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        cl::Program prog;
        buildProgram(prog, select_cl, select_cl_len, options.str());
        selProgs[device] = new Program(prog);

        selKernels[device] = new Kernel(*selProgs[device], "select_kernel");
    });

    int threads[] = {DIMX, DIMY};

    if (ndims == 1) {
        threads[0] *= threads[1];
        threads[1] = 1;
    }

    NDRange local(threads[0],
                  threads[1]);

    int groups_0 = divup(out.info.dims[0], local[0]);
    int groups_1 = divup(out.info.dims[1], local[1]);

    NDRange global(groups_0 * out.info.dims[2] * local[0],
                   groups_1 * out.info.dims[3] * local[1]);

    auto selectOp = make_kernel<Buffer, KParam,
                                Buffer, KParam,
                                Buffer, KParam,
                                Buffer, KParam,
                                int, int>(*selKernels[device]);

    selectOp(EnqueueArgs(getQueue(), global, local),
             *out.data, out.info,
             *cond.data, cond.info,
             *a.data, a.info,
             *b.data, b.info,
             groups_0, groups_1);
}
Developer ID: Brainiarc7, Project: arrayfire, Lines: 58, Source: select.hpp
Example 16: convolve2
void convolve2(Param out, const Param signal, const Param filter)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> convProgs;
        static std::map<int, Kernel*> convKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
            const size_t C0_SIZE = (THREADS_X+2*(fLen-1))* THREADS_Y;
            const size_t C1_SIZE = (THREADS_Y+2*(fLen-1))* THREADS_X;
            size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType="<< dtype_traits<accType>::getName()
                    << " -D CONV_DIM="<< conv_dim
                    << " -D EXPAND="<< expand
                    << " -D FLEN="<< fLen
                    << " -D LOCAL_MEM_SIZE="<<locSize;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
            convProgs[device] = new Program(prog);
            convKernels[device] = new Kernel(*convProgs[device], "convolve");
        });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen*sizeof(accType));
        // FIXME: if the filter array is strided, a direct copy might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen*sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        bufferFree(mBuff);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Developer ID: PierreBizouard, Project: arrayfire, Lines: 56, Source: convolve_separable.hpp
Example 17: identity
static void identity(Param<T> out)
{
    dim3 threads(32, 8);
    int blocks_x = divup(out.dims[0], threads.x);
    int blocks_y = divup(out.dims[1], threads.y);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
    POST_LAUNCH_CHECK();
}
Developer ID: rotorliu, Project: arrayfire, Lines: 10, Source: identity.hpp
Example 18: scan_dim
static void scan_dim(Param &out, const Param &in)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                         (uint)out.info.dims[1],
                         (uint)out.info.dims[2],
                         (uint)out.info.dims[3]};

    groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

    if (groups_all[dim] == 1) {
        scan_dim_fn<Ti, To, op, dim, true>(out, out, in,
                                           threads_y,
                                           groups_all);
    } else {
        Param tmp = out;

        tmp.info.dims[dim] = groups_all[dim];
        tmp.info.strides[0] = 1;
        for (int k = 1; k < 4; k++) {
            tmp.info.strides[k] = tmp.info.strides[k - 1] * tmp.info.dims[k - 1];
        }

        dim_type tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];

        // FIXME: Do I need to free this ?
        tmp.data = cl::Buffer(getContext(), CL_MEM_READ_WRITE, tmp_elements * sizeof(To));

        scan_dim_fn<Ti, To, op, dim, false>(out, tmp, in,
                                            threads_y,
                                            groups_all);

        int gdim = groups_all[dim];
        groups_all[dim] = 1;

        if (op == af_notzero_t) {
            scan_dim_fn<To, To, af_add_t, dim, true>(tmp, tmp, tmp,
                                                     threads_y,
                                                     groups_all);
        } else {
            scan_dim_fn<To, To, op, dim, true>(tmp, tmp, tmp,
                                               threads_y,
                                               groups_all);
        }

        groups_all[dim] = gdim;
        bcast_dim_fn<To, To, op, dim, true>(out, tmp,
                                            threads_y,
                                            groups_all);
    }
}
Developer ID: EasonYi, Project: arrayfire, Lines: 55, Source: scan_dim.hpp
Example 19: matchTemplate
void matchTemplate(Param out, const Param srch, const Param tmplt)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> mtProgs;
        static std::map<int, Kernel*> mtKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
            std::ostringstream options;
            options << " -D inType=" << dtype_traits<inType>::getName()
                    << " -D outType=" << dtype_traits<outType>::getName()
                    << " -D MATCH_T=" << mType
                    << " -D NEEDMEAN="<< needMean
                    << " -D AF_SAD=" << AF_SAD
                    << " -D AF_ZSAD=" << AF_ZSAD
                    << " -D AF_LSAD=" << AF_LSAD
                    << " -D AF_SSD=" << AF_SSD
                    << " -D AF_ZSSD=" << AF_ZSSD
                    << " -D AF_LSSD=" << AF_LSSD
                    << " -D AF_NCC=" << AF_NCC
                    << " -D AF_ZNCC=" << AF_ZNCC
                    << " -D AF_SHD=" << AF_SHD;
            if (std::is_same<outType, double>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, matchTemplate_cl, matchTemplate_cl_len, options.str());
            mtProgs[device] = new Program(prog);
            mtKernels[device] = new Kernel(*mtProgs[device], "matchTemplate");
        });

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(srch.info.dims[0], THREADS_X);
        int blk_y = divup(srch.info.dims[1], THREADS_Y);

        NDRange global(blk_x * srch.info.dims[2] * THREADS_X, blk_y * srch.info.dims[3] * THREADS_Y);

        auto matchImgOp = make_kernel<Buffer, KParam,
                                      Buffer, KParam,
                                      Buffer, KParam,
                                      int, int> (*mtKernels[device]);

        matchImgOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *srch.data, srch.info, *tmplt.data, tmplt.info, blk_x, blk_y);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Developer ID: Brainiarc7, Project: arrayfire, Lines: 55, Source: match_template.hpp
Example 20: memcopy
void memcopy(cl::Buffer out, const dim_t *ostrides,
             const cl::Buffer in, const dim_t *idims,
             const dim_t *istrides, int offset, uint ndims)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> cpyProgs;
        static std::map<int, Kernel*> cpyKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [&]() {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName();

            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, memcopy_cl, memcopy_cl_len, options.str());
            cpyProgs[device] = new Program(prog);
            cpyKernels[device] = new Kernel(*cpyProgs[device], "memcopy_kernel");
        });

        dims_t _ostrides = {{ostrides[0], ostrides[1], ostrides[2], ostrides[3]}};
        dims_t _istrides = {{istrides[0], istrides[1], istrides[2], istrides[3]}};
        dims_t _idims = {{idims[0], idims[1], idims[2], idims[3]}};

        size_t local_size[2] = {DIM0, DIM1};
        if (ndims == 1) {
            local_size[0] *= local_size[1];
            local_size[1] = 1;
        }

        int groups_0 = divup(idims[0], local_size[0]);
        int groups_1 = divup(idims[1], local_size[1]);

        NDRange local(local_size[0], local_size[1]);
        NDRange global(groups_0 * idims[2] * local_size[0],
                       groups_1 * idims[3] * local_size[1]);

        auto memcopy_kernel = KernelFunctor< Buffer, dims_t,
                                             Buffer, dims_t,
                                             dims_t, int,
                                             int, int >(*cpyKernels[device]);

        memcopy_kernel(EnqueueArgs(getQueue(), global, local),
                       out, _ostrides, in, _idims, _istrides, offset, groups_0, groups_1);
        CL_DEBUG_FINISH(getQueue());
    }
    catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Developer ID: shehzan10, Project: arrayfire, Lines: 55, Source: memcopy.hpp
Note: The divup function examples on this page were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by their respective authors, who retain copyright; consult each project's license before redistributing or using the code. Do not reproduce without permission.