• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

C++ divup函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了C++中divup函数的典型用法代码示例。如果您正苦于以下问题:C++ divup函数的具体用法?C++ divup怎么用?C++ divup使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了divup函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: reduce_first

// Launches the reduction of `in` along its first dimension into `out`.
//
// The x block count comes from divup(in.dims[0], threads_x * REPEAT). When it
// is greater than one, the first launch writes into a temporary buffer whose
// dims[0] equals that block count, and a second launch with a single x block
// folds the temporary into `out`. `change_nan` and `nanval` are forwarded
// unchanged to every launch.
void reduce_first(Param<To> out, CParam<Ti> in, bool change_nan,
                  double nanval) {
    // x thread count: nextpow2 of max(32, dims[0]), capped at THREADS_PER_BLOCK.
    const uint pow2_width = nextpow2(std::max(32u, (uint)in.dims[0]));
    uint threads_x        = std::min(pow2_width, THREADS_PER_BLOCK);
    uint threads_y        = THREADS_PER_BLOCK / threads_x;

    uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
    uint blocks_y = divup(in.dims[1], threads_y);

    const bool needs_second_pass = (blocks_x > 1);

    // With a single x block the first launch writes straight into `out`.
    Param<To> partial = out;
    uptr<To> partial_storage;
    if (needs_second_pass) {
        partial_storage =
            memAlloc<To>(blocks_x * in.dims[1] * in.dims[2] * in.dims[3]);
        partial.ptr     = partial_storage.get();
        partial.dims[0] = blocks_x;
        // Scale the copied strides for dims 1..3 by the new dims[0].
        for (int k = 1; k < 4; ++k) { partial.strides[k] *= blocks_x; }
    }

    reduce_first_launcher<Ti, To, op>(partial, in, blocks_x, blocks_y,
                                      threads_x, change_nan, nanval);

    if (!needs_second_pass) { return; }

    // Second pass over the per-block partial results. For af_notzero_t the
    // second pass is launched with af_add_t instead of op.
    if (op == af_notzero_t) {
        reduce_first_launcher<To, To, af_add_t>(out, partial, 1, blocks_y,
                                                threads_x, change_nan, nanval);
    } else {
        reduce_first_launcher<To, To, op>(out, partial, 1, blocks_y, threads_x,
                                          change_nan, nanval);
    }
}
开发者ID:9prady9,项目名称:arrayfire,代码行数:34,代码来源:reduce.hpp


示例2: morph

// Launches morphKernel on `in`, writing to `out`, for a window of length
// `windLen`.
//
// windLen picks the kernel instantiation (odd values 3..19). Any other value
// launches the window-3 instantiation, the same one used for windLen == 3.
void morph(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blocksPerMatX = divup(in.dims[0], THREADS_X);
    int blocksPerMatY = divup(in.dims[1], THREADS_Y);
    // The grid carries dims[2] copies of the x blocks and dims[3] copies of
    // the y blocks.
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3]);

    // Shared memory: one thread tile plus 2 * (windLen / 2) extra entries per
    // axis, with one more column (+1 to avoid bank conflicts).
    int padding     = 2 * (windLen / 2);
    int sharedCols  = kernel::THREADS_X + padding + 1;
    int sharedBytes = sharedCols * (kernel::THREADS_Y + padding) * sizeof(T);

    switch (windLen) {
        case 3:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 5:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 5>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 7:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 7>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 9:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 9>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 11:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 11>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 13:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 13>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 15:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 15>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 17:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 17>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        case 19:
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 19>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
        default:
            // Unlisted window lengths use the window-3 kernel.
            CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads,
                             sharedBytes, out, in, blocksPerMatX, blocksPerMatY);
            break;
    }

    POST_LAUNCH_CHECK();
}
开发者ID:munnybearz,项目名称:arrayfire,代码行数:30,代码来源:morph.hpp


示例3: meanshift

// Launches meanshiftKernel on `in`, writing to `out`.
//
// s_sigma is clamped to at most 11.5 and also determines the radius
// (1.5 * clamped sigma, truncated, at least 1). c_sigma is squared before
// being handed to the kernel. `iter` is forwarded as-is. When is_color is
// set, dims[2] is used as the channel count and the 3-channel kernel is
// launched; otherwise dims[2] multiplies the x block count.
void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint iter)
{
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blocksPerMatX = divup(in.dims[0], THREADS_X);
    int blocksPerMatY = divup(in.dims[1], THREADS_Y);

    const int batchCount   = (is_color ? 1 : in.dims[2]);
    const int channelCount = (is_color ? in.dims[2] : 1); // expected to be 3 for color images

    dim3 blocks(blocksPerMatX * batchCount, blocksPerMatY * in.dims[3]);

    // Clamp the spatial sigma, then derive the radius and the padding.
    float spatialSigma = std::min(11.5f, s_sigma);
    int radius         = std::max((int)(spatialSigma * 1.5f), 1);
    int padding        = 2 * radius + 1;
    const float colorVariance = c_sigma * c_sigma;
    size_t sharedBytes =
        channelCount * (threads.x + padding) * (threads.y + padding) * sizeof(T);

    if (is_color) {
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, sharedBytes,
                         out, in, spatialSigma, radius, colorVariance, iter,
                         blocksPerMatX, blocksPerMatY);
    } else {
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, sharedBytes,
                         out, in, spatialSigma, radius, colorVariance, iter,
                         blocksPerMatX, blocksPerMatY);
    }

    POST_LAUNCH_CHECK();
}
开发者ID:victorv,项目名称:arrayfire,代码行数:28,代码来源:meanshift.hpp


示例4: mxr_mplane_fill

/*
 * Fill the per-subframe plane descriptors for a width x height image in
 * format @fmt.
 *
 * The first fmt->num_subframes entries of @planes are zeroed, then every
 * plane of the format adds its size to the subframe it maps to
 * (fmt->plane2subframe[]) and raises that subframe's bytesperline to the
 * largest value seen. For V4L2_PIX_FMT_NV12MT the size is computed from the
 * aligned width and height instead of the block-based formula.
 */
static void mxr_mplane_fill(struct v4l2_plane_pix_format *planes,
	const struct mxr_format *fmt, u32 width, u32 height)
{
	int idx;
	int luma_bytes, chroma_bytes;

	memset(planes, 0, sizeof(*planes) * fmt->num_subframes);
	for (idx = 0; idx < fmt->num_planes; ++idx) {
		const struct mxr_block *block = &fmt->plane[idx];
		struct v4l2_plane_pix_format *dst =
			&planes[fmt->plane2subframe[idx]];
		u32 blocks_w = divup(width, block->width);
		u32 blocks_h = divup(height, block->height);
		u32 block_bytes = blocks_w * blocks_h * block->size;
		u16 line_bytes = blocks_w * block->size / block->height;

		if (fmt->fourcc != V4L2_PIX_FMT_NV12MT) {
			dst->sizeimage += block_bytes;
		} else {
			/* width aligned to 128, height (or height / 2) to 64 */
			luma_bytes = ALIGN(width, 128) * ALIGN(height, 64);
			chroma_bytes = ALIGN(width, 128) * ALIGN(height / 2, 64);
			/* plane 0 uses the full-height size, later planes the half-height one */
			if (idx)
				dst->sizeimage += chroma_bytes;
			else
				dst->sizeimage += luma_bytes;
		}
		dst->bytesperline = max(dst->bytesperline, line_bytes);
	}
}
开发者ID:cm-3470,项目名称:android_kernel_samsung_degaslte,代码行数:26,代码来源:mixer_video.c


示例5: mean_dim

// Launches the mean along dimension `dim` of `in` (with `inWeight`) into `out`.
//
// When divup(dims[dim], threads_y * REPEAT) is at most one, a single launch
// writes `out` directly. Otherwise a first launch fills temporary output and
// weight arrays whose extent along `dim` equals that group count, and a second
// launch with one group along `dim` produces `out` from them.
void mean_dim(Param out, Param in, Param inWeight, int dim)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(in.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {(uint)divup(in.info.dims[0], threads_x),
                         (uint)in.info.dims[1],
                         (uint)in.info.dims[2],
                         (uint)in.info.dims[3]};

    // The entry for the processed dimension is replaced by its group count.
    groups_all[dim] = divup(in.info.dims[dim], threads_y * REPEAT);

    if (groups_all[dim] <= 1) {
        // Single pass; the weight output is a default-constructed Param.
        Param unusedWeight;
        mean_dim_launcher<Ti, Tw, To>(out, unusedWeight, in, inWeight, dim,
                                      threads_y, groups_all);
        return;
    }

    // First pass: results go into temporaries shaped like `out`, except along
    // `dim`, where the extent is the group count.
    dim4 partialDims(4, out.info.dims);
    partialDims[dim] = groups_all[dim];
    Array<To> partialOut    = createEmptyArray<To>(partialDims);
    Array<Tw> partialWeight = createEmptyArray<Tw>(partialDims);
    mean_dim_launcher<Ti, Tw, To>(partialOut, partialWeight, in, inWeight, dim,
                                  threads_y, groups_all);

    // Second pass: one group along `dim` over the temporaries.
    Param finalWeight;
    groups_all[dim] = 1;
    mean_dim_launcher<Ti, Tw, To>(out, finalWeight, partialOut, partialWeight,
                                  dim, threads_y, groups_all);
}
开发者ID:FilipeMaia,项目名称:arrayfire,代码行数:28,代码来源:mean.hpp


示例6: shift

        // Launches shift_kernel to write `in`, shifted by sdims[0..3], to `out`.
        //
        // The y block count is split between grid.y and grid.z using the
        // active device's maxGridSize[1].
        void shift(Param<T> out, CParam<T> in, const int *sdims)
        {
            dim3 threads(TX, TY, 1);

            int tilesX = divup(out.dims[0], TILEX);
            int tilesY = divup(out.dims[1], TILEY);
            dim3 blocks(tilesX * out.dims[2],
                        tilesY * out.dims[3],
                        1);

            // Fold the y block count into z using maxGridSize[1].
            const int yLimit = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
            blocks.z = divup(blocks.y, yLimit);
            blocks.y = divup(blocks.y, blocks.z);

            // The kernel maps output positions back to input positions, so the
            // shifts are converted first. Each converted value lies in
            // [0, out.dims[i]] (asserted below): the remainder is negated and,
            // for positive shifts, one full extent is added.
            int inputShift[4];
            for (int axis = 0; axis < 4; ++axis) {
                inputShift[axis] = -(sdims[axis] % (int)out.dims[axis]) + out.dims[axis] * (sdims[axis] > 0);
                assert(inputShift[axis] >= 0 && inputShift[axis] <= out.dims[axis]);
            }

            CUDA_LAUNCH((shift_kernel<T>), blocks, threads,
                        out, in,
                        inputShift[0], inputShift[1], inputShift[2], inputShift[3],
                        tilesX, tilesY);
            POST_LAUNCH_CHECK();
        }
开发者ID:munnybearz,项目名称:arrayfire,代码行数:28,代码来源:shift.hpp


示例7: select

        // Launches select_kernel over `out` with condition `cond` and the two
        // inputs `a` and `b`.
        //
        // The kernel's boolean template argument is true when `a` and `b` have
        // identical extents in all four dimensions. For ndims == 1 the thread
        // block is flattened to DIMX * DIMY by 1.
        void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b, int ndims)
        {
            // Stop at the first mismatching extent; the comparisons have no
            // side effects, so the result matches checking all four.
            bool dimsMatch = true;
            for (int i = 0; i < 4 && dimsMatch; ++i) {
                dimsMatch = (a.dims[i] == b.dims[i]);
            }

            dim3 threads(DIMX, DIMY);
            if (ndims == 1) {
                threads.x *= threads.y;
                threads.y = 1;
            }

            int blocksPerMatX = divup(out.dims[0], threads.x);
            int blocksPerMatY = divup(out.dims[1], threads.y);

            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3]);

            if (!dimsMatch) {
                CUDA_LAUNCH((select_kernel<T, false>), blocks, threads,
                            out, cond, a, b, blocksPerMatX, blocksPerMatY);
            } else {
                CUDA_LAUNCH((select_kernel<T, true>), blocks, threads,
                            out, cond, a, b, blocksPerMatX, blocksPerMatY);
            }

        }
开发者ID:Brainiarc7,项目名称:arrayfire,代码行数:30,代码来源:select.hpp


示例8: transpose

// Obtains the "cuda::transpose" kernel for <T, conjugate, is32multiple> via
// getKernel and enqueues it on the active stream to write the transpose of
// `in` to `out`.
//
// The y block count is split between grid.y and grid.z using the active
// device's maxGridSize[1].
void transpose(Param<T> out, CParam<T> in, const bool conjugate,
               const bool is32multiple) {
    static const std::string source(transpose_cuh, transpose_cuh_len);

    // clang-format off
    auto transposeFn = getKernel("cuda::transpose", source,
            {
              TemplateTypename<T>(),
              TemplateArg(conjugate),
              TemplateArg(is32multiple)
            },
            {
              DefineValue(TILE_DIM),
              DefineValue(THREADS_Y)
            }
            );
    // clang-format on

    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    // One TILE_DIM x TILE_DIM tile per block; dims[2] and dims[3] multiply the
    // x and y block counts.
    int tilesX = divup(in.dims[0], TILE_DIM);
    int tilesY = divup(in.dims[1], TILE_DIM);
    dim3 blocks(tilesX * in.dims[2], tilesY * in.dims[3]);

    const int yLimit =
        cuda::getDeviceProp(getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, yLimit);
    blocks.y = divup(blocks.y, blocks.z);

    EnqueueArgs launchArgs(blocks, threads, getActiveStream());
    transposeFn(launchArgs, out, in, tilesX, tilesY);

    POST_LAUNCH_CHECK();
}
开发者ID:9prady9,项目名称:arrayfire,代码行数:34,代码来源:transpose.hpp


示例9: scan_dim_by_key

    // Launches the by-key scan of `in` along `dim` into `out`.
    //
    // If one block covers `dim`, a single "final" launch does all the work.
    // Otherwise three launches run in order: a non-final scan that also fills
    // per-block temporaries (values, flags, ids), a final scan over the
    // per-block values, and a broadcast of those values back into `out`. The
    // three temporaries are allocated with memAlloc and released with memFree
    // before returning.
    void scan_dim_by_key(Param<To> out, CParam<Ti> in, CParam<Tk> key, int dim, bool inclusive_scan)
    {
        uint threads_y = std::min(THREADS_Y, nextpow2(out.dims[dim]));
        uint threads_x = THREADS_X;

        uint blocks_all[] = {divup(out.dims[0], threads_x),
                             out.dims[1], out.dims[2], out.dims[3]};

        blocks_all[dim] = divup(out.dims[dim], threads_y * REPEAT);

        if (blocks_all[dim] == 1) {
            scan_dim_final_launcher<Ti, Tk, To, op>(out, in, key, dim, threads_y,
                                                    blocks_all, true,
                                                    inclusive_scan);
            return;
        }

        Param<To> blockVals = out;
        Param<char> blockFlags;
        Param<int> blockIds;

        // The temporaries hold one entry per block along `dim` and use
        // densely packed strides.
        blockVals.dims[dim]  = blocks_all[dim];
        blockVals.strides[0] = 1;
        for (int k = 1; k < 4; ++k) {
            blockVals.strides[k] = blockVals.strides[k - 1] * blockVals.dims[k - 1];
        }
        for (int k = 0; k < 4; ++k) {
            blockFlags.strides[k] = blockVals.strides[k];
            blockIds.strides[k]   = blockVals.strides[k];
            blockFlags.dims[k]    = blockVals.dims[k];
            blockIds.dims[k]      = blockVals.dims[k];
        }

        int elementCount = blockVals.strides[3] * blockVals.dims[3];
        blockVals.ptr  = memAlloc<To>(elementCount);
        blockFlags.ptr = memAlloc<char>(elementCount);
        blockIds.ptr   = memAlloc<int>(elementCount);

        // Pass 1: non-final scan, also filling the per-block temporaries.
        scan_dim_nonfinal_launcher<Ti, Tk, To, op>(out, blockVals, blockFlags,
                                                   blockIds, in, key, dim,
                                                   threads_y, blocks_all,
                                                   inclusive_scan);

        // Pass 2: final scan of the per-block values in place, keyed by the
        // flags, with one block along `dim`.
        int savedBlocks = blocks_all[dim];
        blocks_all[dim] = 1;
        scan_dim_final_launcher<To, char, To, op>(blockVals, blockVals, blockFlags,
                                                  dim, threads_y, blocks_all,
                                                  false, true);

        // Pass 3: broadcast back into `out` with the original block count.
        blocks_all[dim] = savedBlocks;
        bcast_dim_launcher<To, op>(out, blockVals, blockIds, dim, threads_y, blocks_all);

        memFree(blockVals.ptr);
        memFree(blockFlags.ptr);
        memFree(blockIds.ptr);
    }
开发者ID:shehzan10,项目名称:arrayfire,代码行数:60,代码来源:scan_dim_by_key_impl.hpp


示例10: lookup

// Launches the lookup kernel that fills `out` from `in` using `indices`.
//
// nDims == 1 launches lookup1D over the first dimension of `in` whose extent
// is not 1; any other value launches lookupND<..., dim> on a 2D thread block.
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
{
    if (nDims==1) {
        const dim3 threads(THREADS, 1);
        /* find the first dimension whose extent is not 1 */
        int vDim = 0;
        while (vDim < 4 && in.dims[vDim] == 1) {
            vDim++;
        }
        // If every extent of `in` is 1 there is no such dimension. The search
        // would then leave vDim == 4, and out.dims[vDim] would index past the
        // four dimensions. Fall back to the lookup dimension `dim`.
        // NOTE(review): this assumes out.dims[dim] holds the output length in
        // that case -- confirm against the caller and the lookup1D kernel.
        if (vDim == 4) {
            vDim = static_cast<int>(dim);
        }

        int blks = divup(out.dims[vDim], THREADS*THRD_LOAD);

        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        // dims[2] and dims[3] multiply the x and y block counts.
        dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in, indices, blks_x, blks_y);
    }

    POST_LAUNCH_CHECK();
}
开发者ID:shehzan10,项目名称:arrayfire,代码行数:31,代码来源:lookup.hpp


示例11: transform

        // Copies the transform matrices in `tf` to the c_tmat symbol and
        // launches transform_kernel on `in`, writing to `out`. `inverse`
        // selects the kernel's boolean template argument.
        // NOTE(review): this snippet is cut off after the `else` below; the
        // rest of the function is not visible here.
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            // Six floats are copied per transform.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            // x block count for one image, saved before blocks.x is scaled below.
            const dim_type blocksXPerImage = blocks.x;
            if(nimages > TI) {
                // More than TI images: cap nimages at TI and multiply the
                // x block count by divup(nimages, TI).
                dim_type tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            // With several transforms the y block count is multiplied by their number.
            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                transform_kernel<T, true, method><<<blocks, threads>>>
                                (out, in, nimages, ntransforms, blocksXPerImage);
            } else {
开发者ID:EmergentOrder,项目名称:arrayfire,代码行数:27,代码来源:transform.hpp


示例12: transform

        // Copies the transform matrices in `tf` to the c_tmat symbol
        // asynchronously on the active device's stream, then launches
        // transform_kernel on `in`, writing to `out`. `inverse` selects the
        // kernel's boolean template argument.
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            int nimages = in.dims[2];
            // out.dims[2] was multiplied in src/backend/transform.cpp, so the
            // ratio gives the number of transforms.
            const int ntransforms = out.dims[2] / in.dims[2];

            // Six floats are copied per transform.
            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr,
                                               ntransforms * 6 * sizeof(float), 0,
                                               cudaMemcpyDeviceToDevice,
                                               cuda::getStream(cuda::getActiveDeviceId())));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x),
                        divup(out.dims[1], threads.y));

            // x block count for one image, saved before blocks.x is scaled.
            const int blocksXPerImage = blocks.x;
            if (nimages > TI) {
                // More than TI images: cap nimages at TI and multiply the
                // x block count by divup(nimages, TI).
                int imageTiles = divup(nimages, TI);
                nimages        = TI;
                blocks.x       = blocks.x * imageTiles;
            }

            if (ntransforms > 1) {
                blocks.y *= ntransforms;
            }

            if (!inverse) {
                CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                            out, in, nimages, ntransforms, blocksXPerImage);
            } else {
                CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                            out, in, nimages, ntransforms, blocksXPerImage);
            }
            POST_LAUNCH_CHECK();
        }
开发者ID:hxiaox,项目名称:arrayfire,代码行数:33,代码来源:transform.hpp


示例13: scan_dim

    // Launches the scan of `in` along the runtime dimension `dim` into `out`.
    //
    // If one group covers `dim`, a single launch does all the work. Otherwise
    // three launches run in order: a first scan that also fills a per-group
    // temporary, an inclusive scan of that temporary in place, and a broadcast
    // of the temporary back into `out`. The temporary is allocated with
    // bufferAlloc and released with bufferFree before returning.
    static void scan_dim(Param &out, const Param &in, int dim)
    {
        uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
        uint threads_x = THREADS_X;

        uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                             (uint)out.info.dims[1],
                             (uint)out.info.dims[2],
                             (uint)out.info.dims[3]};

        groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

        if (groups_all[dim] == 1) {
            scan_dim_launcher<Ti, To, op, inclusive_scan>(out, out, in,
                                                          dim, true,
                                                          threads_y,
                                                          groups_all);
            return;
        }

        // The temporary has one entry per group along `dim` and uses densely
        // packed strides.
        Param groupTotals = out;
        groupTotals.info.dims[dim]  = groups_all[dim];
        groupTotals.info.strides[0] = 1;
        for (int k = 1; k < 4; ++k) {
            groupTotals.info.strides[k] =
                groupTotals.info.strides[k - 1] * groupTotals.info.dims[k - 1];
        }

        int elementCount = groupTotals.info.strides[3] * groupTotals.info.dims[3];
        // Released with bufferFree at the end of this function.
        groupTotals.data = bufferAlloc(elementCount * sizeof(To));

        // Pass 1: scan, also filling the per-group temporary.
        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, groupTotals, in,
                                                      dim, false,
                                                      threads_y,
                                                      groups_all);

        // Pass 2: inclusive scan of the temporary in place with one group
        // along `dim`; af_notzero_t is replaced by af_add_t here.
        int savedGroups = groups_all[dim];
        groups_all[dim] = 1;

        if (op == af_notzero_t) {
            scan_dim_launcher<To, To, af_add_t, true>(groupTotals, groupTotals, groupTotals,
                                                      dim, true,
                                                      threads_y,
                                                      groups_all);
        } else {
            scan_dim_launcher<To, To, op, true>(groupTotals, groupTotals, groupTotals,
                                                dim, true,
                                                threads_y,
                                                groups_all);
        }

        // Pass 3: broadcast back into `out` with the original group count.
        groups_all[dim] = savedGroups;
        bcast_dim_launcher<To, To, op, inclusive_scan>(out, groupTotals,
                                                       dim, true,
                                                       threads_y,
                                                       groups_all);
        bufferFree(groupTotals.data);
    }
开发者ID:FilipeMaia,项目名称:arrayfire,代码行数:60,代码来源:scan_dim.hpp


示例14: mxr_get_plane_size

/*
 * Size of one plane of a width x height image: divup(width, blk->width)
 * times divup(height, blk->height) times blk->size.
 */
unsigned long mxr_get_plane_size(const struct mxr_block *blk,
	unsigned int width, unsigned int height)
{
	unsigned int blocks_across = divup(width, blk->width);
	unsigned int blocks_down = divup(height, blk->height);
	unsigned long plane_bytes = blocks_across * blocks_down * blk->size;

	return plane_bytes;
}
开发者ID:cm-3470,项目名称:android_kernel_samsung_degaslte,代码行数:8,代码来源:mixer_video.c


示例15: select_launcher

        // Builds the "select_kernel" OpenCL kernel once per device (guarded
        // by std::call_once) and enqueues it for `out`, `cond`, `a` and `b`.
        //
        // For ndims == 1 the work-group is flattened to DIMX * DIMY by 1.
        void select_launcher(Param out, Param cond, Param a, Param b, int ndims)
        {
            static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
            static std::map<int, Program*>  selProgs;
            static std::map<int, Kernel*> selKernels;

            int device = getActiveDeviceId();

            std::call_once(compileFlags[device], [device] () {
                    // Build options carry is_same and the element type name.
                    std::ostringstream options;
                    options << " -D is_same=" << is_same;
                    options << " -D T=" << dtype_traits<T>::getName();

                    const bool needsDouble = std::is_same<T, double>::value ||
                                             std::is_same<T, cdouble>::value;
                    if (needsDouble) {
                        options << " -D USE_DOUBLE";
                    }

                    cl::Program prog;
                    buildProgram(prog, select_cl, select_cl_len, options.str());

                    Program *builtProgram = new Program(prog);
                    selProgs[device]      = builtProgram;
                    selKernels[device]    = new Kernel(*builtProgram, "select_kernel");
                });

            int localDims[] = {DIMX, DIMY};
            if (ndims == 1) {
                localDims[0] *= localDims[1];
                localDims[1] = 1;
            }

            NDRange local(localDims[0], localDims[1]);

            int groups_0 = divup(out.info.dims[0], local[0]);
            int groups_1 = divup(out.info.dims[1], local[1]);

            // dims[2] and dims[3] multiply the group counts in x and y.
            NDRange global(groups_0 * out.info.dims[2] * local[0],
                           groups_1 * out.info.dims[3] * local[1]);

            auto selectOp = make_kernel<Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        Buffer, KParam,
                                        int, int>(*selKernels[device]);

            selectOp(EnqueueArgs(getQueue(), global, local),
                     *out.data, out.info,
                     *cond.data, cond.info,
                     *a.data, a.info,
                     *b.data, b.info,
                     groups_0, groups_1);

        }
开发者ID:Brainiarc7,项目名称:arrayfire,代码行数:58,代码来源:select.hpp


示例16: convolve2

// Builds the separable "convolve" OpenCL kernel once per device (guarded by
// std::call_once) and enqueues it for `signal` and `filter`, writing to `out`.
//
// The first fLen * sizeof(accType) bytes of the filter are copied into a
// scratch buffer before the launch. That buffer is released on the success
// path and also when a cl::Error is caught, so a failed copy or enqueue no
// longer leaks it. cl::Error is converted with CL_TO_AF_ERROR.
void convolve2(Param out, const Param signal, const Param filter)
{
    // Declared outside the try block so the catch handler can release it.
    cl::Buffer *mBuff = nullptr;
    try {
        static std::once_flag  compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>   convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                // Local memory size depends on which dimension is convolved.
                const size_t C0_SIZE  = (THREADS_X+2*(fLen-1))* THREADS_Y;
                const size_t C1_SIZE  = (THREADS_Y+2*(fLen-1))* THREADS_X;

                size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

                    std::ostringstream options;
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D accType="<< dtype_traits<accType>::getName()
                            << " -D CONV_DIM="<< conv_dim
                            << " -D EXPAND="<< expand
                            << " -D FLEN="<< fLen
                            << " -D LOCAL_MEM_SIZE="<<locSize;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }
                    Program prog;
                    buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
                    convProgs[device]   = new Program(prog);
                    convKernels[device] = new Kernel(*convProgs[device], "convolve");
                });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        // signal dims[2] and dims[3] multiply the group counts in x and y.
        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        mBuff = bufferAlloc(fLen*sizeof(accType));
        // FIX ME: if the filter array is strided, direct might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen*sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        // Clear the pointer before freeing so the handler below cannot free
        // the buffer a second time if bufferFree itself throws.
        cl::Buffer *finished = mBuff;
        mBuff = nullptr;
        bufferFree(finished);
    } catch (cl::Error err) {
        // Release the scratch buffer if the failure happened while it was live.
        if (mBuff != nullptr) {
            bufferFree(mBuff);
        }
        CL_TO_AF_ERROR(err);
        throw;
    }
}
开发者ID:PierreBizouard,项目名称:arrayfire,代码行数:56,代码来源:convolve_separable.hpp


示例17: identity

    // Launches identity_kernel over `out` with 32 x 8 thread blocks.
    // dims[2] and dims[3] multiply the block counts in x and y.
    static void identity(Param<T> out)
    {
        dim3 threads(32, 8);

        int blocksPerMatX = divup(out.dims[0], threads.x);
        int blocksPerMatY = divup(out.dims[1], threads.y);

        dim3 blocks(blocksPerMatX * out.dims[2],
                    blocksPerMatY * out.dims[3]);

        CUDA_LAUNCH((identity_kernel<T>), blocks, threads,
                    out, blocksPerMatX, blocksPerMatY);
        POST_LAUNCH_CHECK();
    }
开发者ID:rotorliu,项目名称:arrayfire,代码行数:10,代码来源:identity.hpp


示例18: scan_dim

    // Launches the scan of `in` along the compile-time dimension `dim` into
    // `out`.
    //
    // If one group covers `dim`, a single launch does all the work. Otherwise
    // three launches run in order: a first scan that also fills a per-group
    // temporary, a scan of that temporary in place, and a broadcast of the
    // temporary back into `out`.
    static void scan_dim(Param &out, const Param &in)
    {
        uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
        uint threads_x = THREADS_X;

        uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                             (uint)out.info.dims[1],
                             (uint)out.info.dims[2],
                             (uint)out.info.dims[3]};

        groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

        if (groups_all[dim] == 1) {
            scan_dim_fn<Ti, To, op, dim, true>(out, out, in,
                                               threads_y,
                                               groups_all);
            return;
        }

        // The temporary has one entry per group along `dim` and uses densely
        // packed strides.
        Param groupTotals = out;
        groupTotals.info.dims[dim]  = groups_all[dim];
        groupTotals.info.strides[0] = 1;
        for (int k = 1; k < 4; ++k) {
            groupTotals.info.strides[k] =
                groupTotals.info.strides[k - 1] * groupTotals.info.dims[k - 1];
        }

        dim_type elementCount = groupTotals.info.strides[3] * groupTotals.info.dims[3];
        // FIXME (kept from the original): unclear whether this buffer needs
        // an explicit release.
        groupTotals.data = cl::Buffer(getContext(), CL_MEM_READ_WRITE, elementCount * sizeof(To));

        // Pass 1: scan, also filling the per-group temporary.
        scan_dim_fn<Ti, To, op, dim, false>(out, groupTotals, in,
                                            threads_y,
                                            groups_all);

        // Pass 2: scan of the temporary in place with one group along `dim`;
        // af_notzero_t is replaced by af_add_t here.
        int savedGroups = groups_all[dim];
        groups_all[dim] = 1;

        if (op == af_notzero_t) {
            scan_dim_fn<To, To, af_add_t, dim, true>(groupTotals, groupTotals, groupTotals,
                                                     threads_y,
                                                     groups_all);
        } else {
            scan_dim_fn<To, To, op, dim, true>(groupTotals, groupTotals, groupTotals,
                                               threads_y,
                                               groups_all);
        }

        // Pass 3: broadcast back into `out` with the original group count.
        groups_all[dim] = savedGroups;
        bcast_dim_fn<To, To, op, dim, true>(out, groupTotals,
                                            threads_y,
                                            groups_all);

    }
开发者ID:EasonYi,项目名称:arrayfire,代码行数:55,代码来源:scan_dim.hpp


示例19: matchTemplate

// Builds the "matchTemplate" OpenCL kernel once per device (guarded by
// std::call_once) and enqueues it for `srch` and `tmplt`, writing to `out`.
//
// cl::Error is converted with CL_TO_AF_ERROR.
void matchTemplate(Param out, const Param srch, const Param tmplt)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*>  mtProgs;
        static std::map<int, Kernel*> mtKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                // Build options: the type names, the match type, the needMean
                // flag, and the value of every AF_* match-type constant.
                std::ostringstream options;
                options << " -D inType="  << dtype_traits<inType>::getName();
                options << " -D outType=" << dtype_traits<outType>::getName();
                options << " -D MATCH_T=" << mType;
                options << " -D NEEDMEAN="<< needMean;
                options << " -D AF_SAD="  << AF_SAD;
                options << " -D AF_ZSAD=" << AF_ZSAD;
                options << " -D AF_LSAD=" << AF_LSAD;
                options << " -D AF_SSD="  << AF_SSD;
                options << " -D AF_ZSSD=" << AF_ZSSD;
                options << " -D AF_LSSD=" << AF_LSSD;
                options << " -D AF_NCC="  << AF_NCC;
                options << " -D AF_ZNCC=" << AF_ZNCC;
                options << " -D AF_SHD="  << AF_SHD;
                if (std::is_same<outType, double>::value) {
                    options << " -D USE_DOUBLE";
                }

                Program prog;
                buildProgram(prog, matchTemplate_cl, matchTemplate_cl_len, options.str());

                Program *builtProgram = new Program(prog);
                mtProgs[device]       = builtProgram;
                mtKernels[device]     = new Kernel(*builtProgram, "matchTemplate");
            });

        NDRange local(THREADS_X, THREADS_Y);

        int groupsX = divup(srch.info.dims[0], THREADS_X);
        int groupsY = divup(srch.info.dims[1], THREADS_Y);

        // dims[2] and dims[3] multiply the group counts in x and y.
        NDRange global(groupsX * srch.info.dims[2] * THREADS_X,
                       groupsY * srch.info.dims[3] * THREADS_Y);

        auto matchImgOp = make_kernel<Buffer, KParam,
                                      Buffer, KParam,
                                      Buffer, KParam,
                                      int, int> (*mtKernels[device]);

        matchImgOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info,
                   *srch.data, srch.info,
                   *tmplt.data, tmplt.info,
                   groupsX, groupsY);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
开发者ID:Brainiarc7,项目名称:arrayfire,代码行数:55,代码来源:match_template.hpp


示例20: memcopy

    // Builds the "memcopy_kernel" OpenCL kernel once per device (guarded by
    // std::call_once) and enqueues it to copy from `in` to `out`.
    //
    // ostrides, idims and istrides are each read at indices 0..3. For
    // ndims == 1 the work-group is flattened to DIM0 * DIM1 by 1. cl::Error
    // is converted with CL_TO_AF_ERROR.
    void memcopy(cl::Buffer out, const dim_t *ostrides,
                 const cl::Buffer in, const dim_t *idims,
                 const dim_t *istrides, int offset, uint ndims)
    {
        try {
            static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
            static std::map<int, Program*>    cpyProgs;
            static std::map<int, Kernel*>   cpyKernels;

            int device = getActiveDeviceId();

            std::call_once(compileFlags[device], [&]() {
                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName();

                const bool needsDouble = std::is_same<T, double>::value ||
                                         std::is_same<T, cdouble>::value;
                if (needsDouble) {
                    options << " -D USE_DOUBLE";
                }

                Program prog;
                buildProgram(prog, memcopy_cl, memcopy_cl_len, options.str());

                Program *builtProgram = new Program(prog);
                cpyProgs[device]      = builtProgram;
                cpyKernels[device]    = new Kernel(*builtProgram, "memcopy_kernel");
            });

            // Pack the four-entry arrays into dims_t values for the kernel.
            dims_t _idims    = {{idims[0], idims[1], idims[2], idims[3]}};
            dims_t _istrides = {{istrides[0], istrides[1], istrides[2], istrides[3]}};
            dims_t _ostrides = {{ostrides[0], ostrides[1], ostrides[2], ostrides[3]}};

            size_t local_size[2] = {DIM0, DIM1};
            if (ndims == 1) {
                local_size[0] *= local_size[1];
                local_size[1]  = 1;
            }

            int groups_0 = divup(idims[0], local_size[0]);
            int groups_1 = divup(idims[1], local_size[1]);

            // idims[2] and idims[3] multiply the group counts in x and y.
            NDRange local(local_size[0], local_size[1]);
            NDRange global(groups_0 * idims[2] * local_size[0],
                           groups_1 * idims[3] * local_size[1]);

            auto memcopy_kernel = KernelFunctor< Buffer, dims_t,
                                                 Buffer, dims_t,
                                                 dims_t, int,
                                                 int, int >(*cpyKernels[device]);

            memcopy_kernel(EnqueueArgs(getQueue(), global, local),
                           out, _ostrides,
                           in, _idims, _istrides,
                           offset, groups_0, groups_1);
            CL_DEBUG_FINISH(getQueue());
        }
        catch (cl::Error err) {
            CL_TO_AF_ERROR(err);
            throw;
        }
    }
开发者ID:shehzan10,项目名称:arrayfire,代码行数:55,代码来源:memcopy.hpp



注:本文中的divup函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
C++ dixLookupPrivate函数代码示例发布时间:2022-05-30
下一篇:
C++ divide函数代码示例发布时间:2022-05-30
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap