This article collects typical usage examples of the C++ function op_timing_realloc. If you have been wondering how op_timing_realloc is used in C++, how to call it, or what real examples look like, the hand-picked code samples below should help.
Twenty code examples of op_timing_realloc are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ examples.
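All of the stubs below share the same instrumentation pattern: op_timing_realloc(k) makes sure the global OP_kernels array has an entry for kernel index k, a pair of op_timers_core (or op_timers) calls brackets the work, and the elapsed wall time plus estimated data transfer are accumulated into OP_kernels[k]. The minimal sketch below shows that shape in isolation; the kernel index 0, the function name, and the placeholder work comment are illustrative only, and it assumes the standard OP2 core declarations (op_timing_realloc, op_timers_core, OP_kernels) are available from the OP2 headers.
// Minimal sketch of the timing pattern shared by the generated stubs below.
// timed_stub_sketch is a made-up name; only the OP2 calls are real.
#include "op_lib_core.h"  // assumed OP2 core header: op_timing_realloc, op_timers_core, OP_kernels
void timed_stub_sketch(char const *name) {
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);                      // ensure OP_kernels[0] exists
  op_timers_core(&cpu_t1, &wall_t1);         // start CPU / wall-clock timers
  // ... the per-element work of the kernel would go here ...
  op_timers_core(&cpu_t2, &wall_t2);         // stop timers
  OP_kernels[0].name   = name;               // record the kernel name
  OP_kernels[0].count += 1;                  // one more invocation
  OP_kernels[0].time  += wall_t2 - wall_t1;  // accumulate elapsed wall time
}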
Example 1: op_par_loop_res
// host stub function
void op_par_loop_res(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3){
int nargs = 4;
op_arg args[4];
args[0] = arg0;
args[1] = arg1;
args[2] = arg2;
args[3] = arg3;
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
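// op_timing_realloc(0) grows the global OP_kernels timing array so that entry 0 exists before it is updated at the end of this stub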
op_timing_realloc(0);
op_timers_core(&cpu_t1, &wall_t1);
if (OP_diags>2) {
printf(" kernel routine with indirection: res\n");
}
int set_size = op_mpi_halo_exchanges(set, nargs, args);
if (set->size >0) {
for ( int n=0; n<set_size; n++ ){
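// elements below set->core_size need no halo data, so the MPI halo exchange only has to complete before the remaining elements are processed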
if (n==set->core_size) {
op_mpi_wait_all(nargs, args);
}
int map1idx = arg1.map_data[n * arg1.map->dim + 1];
int map2idx = arg1.map_data[n * arg1.map->dim + 0];
res(
&((double*)arg0.data)[1 * n],
&((double*)arg1.data)[1 * map1idx],
&((double*)arg2.data)[1 * map2idx],
(double*)arg3.data);
}
}
if (set_size == 0 || set_size == set->core_size) {
op_mpi_wait_all(nargs, args);
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (float)set->size * arg1.size;
OP_kernels[0].transfer += (float)set->size * arg2.size * 2.0f;
OP_kernels[0].transfer += (float)set->size * arg0.size;
OP_kernels[0].transfer += (float)set->size * arg3.size;
OP_kernels[0].transfer += (float)set->size * arg1.map->dim * 4.0f;
}
Author: OP2, Project: OP2-Common, Lines: 61, Source: res_seqkernel.cpp
Example 2: save_soln_host
void save_soln_host(const char *userSubroutine,op_set set,op_arg opDat1,op_arg opDat2)
{
size_t blocksPerGrid;
size_t threadsPerBlock;
size_t totalThreadNumber;
size_t dynamicSharedMemorySize;
cl_int errorCode;
cl_event event;
cl_kernel kernelPointer;
int sharedMemoryOffset;
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers(&cpu_t1, &wall_t1);
blocksPerGrid = 200;
threadsPerBlock = threadsPerBlockSize_save_soln;
totalThreadNumber = threadsPerBlock * blocksPerGrid;
dynamicSharedMemorySize = 0;
dynamicSharedMemorySize = MAX(dynamicSharedMemorySize,sizeof(float ) * 4);
dynamicSharedMemorySize = MAX(dynamicSharedMemorySize,sizeof(float ) * 4);
sharedMemoryOffset = dynamicSharedMemorySize * OP_WARPSIZE;
dynamicSharedMemorySize = dynamicSharedMemorySize * threadsPerBlock;
kernelPointer = getKernel("save_soln_kernel");
errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem ),&opDat1.data_d);
errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem ),&opDat2.data_d);
errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(int ),&sharedMemoryOffset);
errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(int ),&set -> size);
//errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(size_t ),&dynamicSharedMemorySize);
errorCode = errorCode | clSetKernelArg(kernelPointer,4,dynamicSharedMemorySize,NULL);
//printf("errorCode after 5: %d\n", errorCode);
assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments save_soln");
errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,&event);
assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel save_soln");
errorCode = clFinish(cqCommandQueue);
assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");
#ifdef PROFILE
// query the profiling timestamps recorded on the kernel-launch event
cl_ulong tqueue, tsubmit, tstart, tend;
errorCode = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(tqueue), &tqueue, NULL);
errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(tsubmit), &tsubmit, NULL);
errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL);
errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(tend), &tend, NULL);
assert_m(errorCode == CL_SUCCESS, "error getting profiling info");
OP_kernels[0].queue_time += (tsubmit - tqueue);
OP_kernels[0].wait_time += (tstart - tsubmit);
OP_kernels[0].execution_time += (tend - tstart);
//printf("%20lu\n%20lu\n%20lu\n%20lu\n\n", tqueue, tsubmit, tstart, tend);
//printf("queue: %8.4f\nwait:%8.4f\nexec: %8.4f\n\n", OP_kernels[0].queue_time * 1.0e-9, OP_kernels[0].wait_time * 1.0e-9, OP_kernels[0].execution_time * 1.0e-9 );
#endif
// update kernel record
op_timers(&cpu_t2, &wall_t2);
op_timing_realloc(0);
OP_kernels[0].name = userSubroutine;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (float)set->size * opDat1.size;
OP_kernels[0].transfer += (float)set->size * opDat2.size;
}
Author: roxanarusitoru, Project: OP2_Compiler_generated_code, Lines: 58, Source: save_soln_host.cl.cpp
Example 3: op_par_loop_save_soln
void op_par_loop_save_soln(char const *name, op_set set,
op_arg arg0,
op_arg arg1 ){
int ninds = 0;
int nargs = 2;
op_arg args[2] = {arg0,arg1};
if (OP_diags>2) {
printf(" kernel routine w/o indirection: save_soln \n");
}
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers_core(&cpu_t1, &wall_t1);
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// execute plan
#pragma omp parallel for
for (int thr=0; thr<nthreads; thr++) {
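// static partitioning: each thread handles a contiguous [start, finish) chunk of the set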
int start = (set->size* thr )/nthreads;
int finish = (set->size*(thr+1))/nthreads;
op_x86_save_soln( (double *) arg0.data,
(double *) arg1.data,
start, finish );
}
//set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
for(int i = 0; i<nargs; i++)
if(args[i].argtype == OP_ARG_DAT)
set_dirtybit(args[i]);
// perform any global operations
// - NONE
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
op_timing_realloc(0);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (double)set->size * arg0.size;
OP_kernels[0].transfer += (double)set->size * arg1.size;
}
Author: ioz9, Project: OP2-Common, Lines: 56, Source: save_soln_mpi_kernel.cpp
Example 4: op_par_loop_update
// host stub function
void op_par_loop_update(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3,
op_arg arg4){
int nargs = 5;
op_arg args[5];
args[0] = arg0;
args[1] = arg1;
args[2] = arg2;
args[3] = arg3;
args[4] = arg4;
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timing_realloc(1);
op_timers_core(&cpu_t1, &wall_t1);
if (OP_diags>2) {
printf(" kernel routine w/o indirection: update");
}
int set_size = op_mpi_halo_exchanges(set, nargs, args);
if (set->size >0) {
for ( int n=0; n<set_size; n++ ){
update(
&((float*)arg0.data)[1*n],
&((float*)arg1.data)[1*n],
&((float*)arg2.data)[1*n],
(float*)arg3.data,
(float*)arg4.data);
}
}
// combine reduction data
op_mpi_reduce_float(&arg3,(float*)arg3.data);
op_mpi_reduce_float(&arg4,(float*)arg4.data);
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[1].name = name;
OP_kernels[1].count += 1;
OP_kernels[1].time += wall_t2 - wall_t1;
OP_kernels[1].transfer += (float)set->size * arg0.size;
OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
Author: OP2, Project: OP2-Common, Lines: 55, Source: update_seqkernel.cpp
Example 5: op_par_loop_save_soln_cpu
// host stub function
void op_par_loop_save_soln_cpu(char const *name, op_set set,
op_arg arg0,
op_arg arg1){
int nargs = 2;
op_arg args[2];
args[0] = arg0;
args[1] = arg1;
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timing_realloc(0);
op_timers_core(&cpu_t1, &wall_t1);
if (OP_diags>2) {
printf(" kernel routine w/o indirection: save_soln");
}
op_mpi_halo_exchanges(set, nargs, args);
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads();
#else
int nthreads = 1;
#endif
if (set->size >0) {
// execute plan
#pragma omp parallel for
for ( int thr=0; thr<nthreads; thr++ ){
int start = (set->size* thr)/nthreads;
int finish = (set->size*(thr+1))/nthreads;
for ( int n=start; n<finish; n++ ){
save_soln(
&((double*)arg0.data)[4*n],
&((double*)arg1.data)[4*n]);
}
}
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (float)set->size * arg0.size;
OP_kernels[0].transfer += (float)set->size * arg1.size;
}
Author: pushkarjain1991, Project: OP2-Common, Lines: 55, Source: save_soln_cpu_kernel.cpp
Example 6: op_par_loop_save_soln
void op_par_loop_save_soln(char const *name, op_set set,
op_arg arg0,
op_arg arg1 ) {
if (OP_diags>2) {
printf(" kernel routine w/o indirection: save_soln \n");
}
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers(&cpu_t1, &wall_t1);
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// execute plan
#pragma omp parallel for
for (int thr=0; thr<nthreads; thr++) {
int start = (set->size* thr )/nthreads;
int finish = (set->size*(thr+1))/nthreads;
op_x86_save_soln( (float *) arg0.data,
(float *) arg1.data,
start, finish );
}
// update kernel record
op_timers(&cpu_t2, &wall_t2);
op_timing_realloc(0);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (float)set->size * arg0.size;
OP_kernels[0].transfer += (float)set->size * arg1.size;
}
Author: doru1004, Project: OP2-Common, Lines: 43, Source: save_soln_kernel.cpp
Example 7: op_par_loop_res_calc
void op_par_loop_res_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3,
op_arg arg4,
op_arg arg5,
op_arg arg6,
op_arg arg7 ) {
int nargs = 8;
op_arg args[8];
args[0] = arg0;
args[1] = arg1;
args[2] = arg2;
args[3] = arg3;
args[4] = arg4;
args[5] = arg5;
args[6] = arg6;
args[7] = arg7;
int ninds = 4;
int inds[8] = {0,0,1,1,2,2,3,3};
if (OP_diags>2) {
printf(" kernel routine with indirection: res_calc\n");
}
// get plan
#ifdef OP_PART_SIZE_2
int part_size = OP_PART_SIZE_2;
#else
int part_size = OP_part_size;
#endif
int set_size = op_mpi_halo_exchanges(set, nargs, args);
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers_core(&cpu_t1, &wall_t1);
if (set->size >0) {
op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
// execute plan
int block_offset = 0;
for (int col=0; col < Plan->ncolors; col++) {
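// the first ncolors_core colours touch only core elements; halo exchanges must have completed before the remaining colours execute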
if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
op_x86_res_calc( blockIdx,
(double *)arg0.data,
(double *)arg2.data,
(double *)arg4.data,
(double *)arg6.data,
Plan->ind_map,
Plan->loc_map,
Plan->ind_sizes,
Plan->ind_offs,
block_offset,
Plan->blkmap,
Plan->offset,
Plan->nelems,
Plan->nthrcol,
Plan->thrcol,
set_size);
block_offset += nblocks;
}
op_timing_realloc(2);
OP_kernels[2].transfer += Plan->transfer;
OP_kernels[2].transfer2 += Plan->transfer2;
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
op_timing_realloc(2);
OP_kernels[2].name = name;
OP_kernels[2].count += 1;
OP_kernels[2].time += wall_t2 - wall_t1;
}
Author: xyuan, Project: OP2-Common, Lines: 99, Source: res_calc_kernel.cpp
Example 8: op_par_loop_res_calc
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg4,
op_arg arg8,
op_arg arg9){
int nargs = 13;
op_arg args[13];
arg0.idx = 0;
args[0] = arg0;
for ( int v=1; v<4; v++ ){
args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
}
arg4.idx = 0;
args[4] = arg4;
for ( int v=1; v<4; v++ ){
args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
}
args[8] = arg8;
arg9.idx = 0;
args[9] = arg9;
for ( int v=1; v<4; v++ ){
args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
}
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timing_realloc(0);
op_timers_core(&cpu_t1, &wall_t1);
if (OP_diags>2) {
printf(" kernel routine with indirection: res_calc\n");
}
int set_size = op_mpi_halo_exchanges(set, nargs, args);
if (set->size >0) {
for ( int n=0; n<set_size; n++ ){
if (n==set->core_size) {
op_mpi_wait_all(nargs, args);
}
int map0idx = arg0.map_data[n * arg0.map->dim + 0];
int map1idx = arg0.map_data[n * arg0.map->dim + 1];
int map2idx = arg0.map_data[n * arg0.map->dim + 2];
int map3idx = arg0.map_data[n * arg0.map->dim + 3];
const double* arg0_vec[] = {
&((double*)arg0.data)[2 * map0idx],
&((double*)arg0.data)[2 * map1idx],
&((double*)arg0.data)[2 * map2idx],
&((double*)arg0.data)[2 * map3idx]};
const double* arg4_vec[] = {
&((double*)arg4.data)[1 * map0idx],
&((double*)arg4.data)[1 * map1idx],
&((double*)arg4.data)[1 * map2idx],
&((double*)arg4.data)[1 * map3idx]};
double* arg9_vec[] = {
&((double*)arg9.data)[1 * map0idx],
&((double*)arg9.data)[1 * map1idx],
&((double*)arg9.data)[1 * map2idx],
&((double*)arg9.data)[1 * map3idx]};
res_calc(
arg0_vec,
arg4_vec,
&((double*)arg8.data)[16 * n],
arg9_vec);
}
}
if (set_size == 0 || set_size == set->core_size) {
op_mpi_wait_all(nargs, args);
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
OP_kernels[0].transfer += (float)set->size * arg0.size;
OP_kernels[0].transfer += (float)set->size * arg4.size;
OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f;
OP_kernels[0].transfer += (float)set->size * arg8.size;
OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f;
}
Author: OP2, Project: OP2-Common, Lines: 93, Source: res_calc_seqkernel.cpp
Example 9: op_par_loop_res_calc
void op_par_loop_res_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg4,
op_arg arg8,
op_arg arg9 ){
int nargs = 13;
op_arg args[13];
arg0.idx = 0;
args[0] = arg0;
for (int v = 1; v < 4; v++) {
args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
}
arg4.idx = 0;
args[4] = arg4;
for (int v = 1; v < 4; v++) {
args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
}
args[8] = arg8;
arg9.idx = 0;
args[9] = arg9;
for (int v = 1; v < 4; v++) {
args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
}
int ninds = 3;
int inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};
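// inds[i] names the indirect dataset argument i goes through when op_plan_get builds the plan; -1 marks the directly accessed argument (arg8)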
if (OP_diags>2) {
printf(" kernel routine with indirection: res_calc\n");
}
// get plan
#ifdef OP_PART_SIZE_0
int part_size = OP_PART_SIZE_0;
#else
int part_size = OP_part_size;
#endif
int set_size = op_mpi_halo_exchanges(set, nargs, args);
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers_core(&cpu_t1, &wall_t1);
if (set->size >0) {
op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
// execute plan
int block_offset = 0;
for (int col=0; col < Plan->ncolors; col++) {
if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
op_x86_res_calc( blockIdx,
(double *)arg0.data,
(double *)arg4.data,
(double *)arg9.data,
Plan->ind_map,
Plan->loc_map,
(double *)arg8.data,
Plan->ind_sizes,
Plan->ind_offs,
block_offset,
Plan->blkmap,
Plan->offset,
Plan->nelems,
Plan->nthrcol,
Plan->thrcol,
set_size);
block_offset += nblocks;
}
op_timing_realloc(0);
OP_kernels[0].transfer += Plan->transfer;
OP_kernels[0].transfer2 += Plan->transfer2;
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
op_timing_realloc(0);
OP_kernels[0].name = name;
//......... rest of the code omitted here .........
Author: xyuan, Project: OP2-Common, Lines: 101, Source: res_calc_kernel.cpp
Example 10: op_par_loop_res_calc
// host stub function
void op_par_loop_res_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg4,
op_arg arg8,
op_arg arg9,
op_arg arg13){
int nargs = 17;
op_arg args[17];
arg0.idx = 0;
args[0] = arg0;
for ( int v=1; v<4; v++ ){
args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
}
arg4.idx = 0;
args[4] = arg4;
for ( int v=1; v<4; v++ ){
args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
}
args[8] = arg8;
arg9.idx = 0;
args[9] = arg9;
for ( int v=1; v<4; v++ ){
args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
}
arg13.idx = 0;
args[13] = arg13;
for ( int v=1; v<4; v++ ){
args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
}
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timing_realloc(0);
op_timers_core(&cpu_t1, &wall_t1);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
int ninds = 4;
int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};
if (OP_diags>2) {
printf(" kernel routine with indirection: res_calc\n");
}
// get plan
int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);
#ifdef OP_PART_SIZE_0
int part_size = OP_PART_SIZE_0;
#else
int part_size = OP_part_size;
#endif
#ifdef OP_BLOCK_SIZE_0
int nthread = OP_BLOCK_SIZE_0;
#else
int nthread = OP_block_size;
#endif
int ncolors = 0;
int set_size1 = set->size + set->exec_size;
if (set->size >0) {
if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
}
if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
}
//Set up typed device pointers for OpenMP
int *map0 = arg0.map_data_d;
int map0size = arg0.map->dim * set_size1;
double* data8 = (double*)arg8.data_d;
int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
double *data0 = (double *)arg0.data_d;
int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
double *data4 = (double *)arg4.data_d;
int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
double *data9 = (double *)arg9.data_d;
int dat9size =
(arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
double *data13 = (double *)arg13.data_d;
int dat13size =
(arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;
op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
ncolors = Plan->ncolors;
int *col_reord = Plan->col_reord;
//......... rest of the code omitted here .........
Author: OP2, Project: OP2-Common, Lines: 101, Source: res_calc_omp4kernel.cpp
Example 11: op_par_loop_dotR
void op_par_loop_dotR(char const *name, op_set set,
op_arg arg0,
op_arg arg1 ){
double *arg1h = (double *)arg1.data;
int nargs = 2;
op_arg args[2];
args[0] = arg0;
args[1] = arg1;
if (OP_diags>2) {
printf(" kernel routine w/o indirection: dotR\n");
}
op_mpi_halo_exchanges(set, nargs, args);
// initialise timers
double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
op_timing_realloc(6);
OP_kernels[6].name = name;
OP_kernels[6].count += 1;
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// allocate and initialise arrays for global reduction
double arg1_l[1+64*64];
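// per-thread partial sums for the global reduction: each thread writes to its own 64-element slot, which keeps the slots on separate cache lines and supports up to 64 threads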
for (int thr=0; thr<nthreads; thr++)
for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_double;
if (set->size >0) {
op_timers_core(&cpu_t1, &wall_t1);
// execute plan
#pragma omp parallel for
for (int thr=0; thr<nthreads; thr++) {
int start = (set->size* thr )/nthreads;
int finish = (set->size*(thr+1))/nthreads;
op_x86_dotR( (double *) arg0.data,
arg1_l + thr*64,
start, finish );
}
}
// combine reduction data
for (int thr=0; thr<nthreads; thr++)
for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
op_mpi_reduce(&arg1,arg1h);
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[6].time += wall_t2 - wall_t1;
OP_kernels[6].transfer += (float)set->size * arg0.size;
}
Author: Benjamin-git, Project: OP2-Common, Lines: 72, Source: dotR_kernel.cpp
Example 12: op_par_loop_adt_calc
void op_par_loop_adt_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3,
op_arg arg4,
op_arg arg5 ){
int nargs = 6;
op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};
int ninds = 1;
int inds[6] = {0,0,0,0,-1,-1};
int sent[6] = {0,0,0,0,0,0};
if(ninds > 0) //indirect loop
{
for(int i = 0; i<nargs; i++)
{
if(args[i].argtype == OP_ARG_DAT)
{
if (OP_diags==1) reset_halo(args[i]);
sent[0] = exchange_halo(args[i]);
if(sent[0] == 1)wait_all(args[i]);
}
}
}
if (OP_diags>2) {
printf(" kernel routine with indirection: adt_calc \n");
}
// get plan
#ifdef OP_PART_SIZE_1
int part_size = OP_PART_SIZE_1;
#else
int part_size = OP_part_size;
#endif
op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers(&cpu_t1, &wall_t1);
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// execute plan
int block_offset = 0;
for (int col=0; col < Plan->ncolors; col++) {
int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
op_x86_adt_calc( blockIdx,
(double *)arg0.data, Plan->ind_maps[0],
Plan->loc_maps[0],
Plan->loc_maps[1],
Plan->loc_maps[2],
Plan->loc_maps[3],
(double *)arg4.data,
(double *)arg5.data,
Plan->ind_sizes,
Plan->ind_offs,
block_offset,
Plan->blkmap,
Plan->offset,
Plan->nelems,
Plan->nthrcol,
Plan->thrcol);
block_offset += nblocks;
}
//set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
for(int i = 0; i<nargs; i++)
if(args[i].argtype == OP_ARG_DAT)
set_dirtybit(args[i]);
// perform any global operations
// - NONE
// update kernel record
op_timers(&cpu_t2, &wall_t2);
op_timing_realloc(1);
//......... rest of the code omitted here .........
Author: doru1004, Project: OP2-Common, Lines: 101, Source: adt_calc_mpi_kernel.cpp
Example 13: op_par_loop_update
// host stub function
void op_par_loop_update(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3){
double*arg3h = (double *)arg3.data;
int nargs = 4;
op_arg args[4];
args[0] = arg0;
args[1] = arg1;
args[2] = arg2;
args[3] = arg3;
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timing_realloc(8);
op_timers_core(&cpu_t1, &wall_t1);
OP_kernels[8].name = name;
OP_kernels[8].count += 1;
if (OP_diags>2) {
printf(" kernel routine w/o indirection: update");
}
op_mpi_halo_exchanges_cuda(set, nargs, args);
#ifdef OP_PART_SIZE_8
int part_size = OP_PART_SIZE_8;
#else
int part_size = OP_part_size;
#endif
#ifdef OP_BLOCK_SIZE_8
int nthread = OP_BLOCK_SIZE_8;
#else
int nthread = OP_block_size;
#endif
double arg3_l = arg3h[0];
if (set->size >0) {
//Set up typed device pointers for OpenMP
double* data0 = (double*)arg0.data_d;
int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
double* data1 = (double*)arg1.data_d;
int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim;
double* data2 = (double*)arg2.data_d;
int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim;
update_omp4_kernel(
data0,
dat0size,
data1,
dat1size,
data2,
dat2size,
&arg3_l,
set->size,
part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread,
nthread);
}
// combine reduction data
arg3h[0] = arg3_l;
op_mpi_reduce_double(&arg3,arg3h);
op_mpi_set_dirtybit_cuda(nargs, args);
if (OP_diags>1) deviceSync();
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
OP_kernels[8].time += wall_t2 - wall_t1;
OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f;
OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f;
OP_kernels[8].transfer += (float)set->size * arg2.size;
}
Author: OP2, Project: OP2-Common, Lines: 80, Source: update_omp4kernel.cpp
Example 14: op_par_loop_res_calc
void op_par_loop_res_calc(char const *name, op_set set,
op_arg arg0,
op_arg arg1 ){
int *arg1h = (int *)arg1.data;
int nargs = 2;
op_arg args[2];
args[0] = arg0;
args[1] = arg1;
int ninds = 1;
int inds[2] = {0,-1};
if (OP_diags>2) {
printf(" kernel routine with indirection: res_calc\n");
}
// get plan
#ifdef OP_PART_SIZE_0
int part_size = OP_PART_SIZE_0;
#else
int part_size = OP_part_size;
#endif
int set_size = op_mpi_halo_exchanges(set, nargs, args);
// initialise timers
double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
op_timing_realloc(0);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// allocate and initialise arrays for global reduction
int arg1_l[1+64*64];
for (int thr=0; thr<nthreads; thr++)
for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int;
if (set->size >0) {
op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
op_timers_core(&cpu_t1, &wall_t1);
// execute plan
int block_offset = 0;
for (int col=0; col < Plan->ncolors; col++) {
if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
op_x86_res_calc( blockIdx,
(double *)arg0.data,
Plan->ind_map,
Plan->loc_map,
&arg1_l[64*omp_get_thread_num()],
Plan->ind_sizes,
Plan->ind_offs,
block_offset,
Plan->blkmap,
Plan->offset,
Plan->nelems,
Plan->nthrcol,
Plan->thrcol,
set_size);
// combine reduction data
if (col == Plan->ncolors_owned-1) {
for (int thr=0; thr<nthreads; thr++)
for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
}
block_offset += nblocks;
}
op_timing_realloc(0);
OP_kernels[0].transfer += Plan->transfer;
OP_kernels[0].transfer2 += Plan->transfer2;
}
// combine reduction data
//......... rest of the code omitted here .........
Author: Benjamin-git, Project: OP2-Common, Lines: 101, Source: res_calc_kernel.cpp
Example 15: op_par_loop_res
void op_par_loop_res(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3 ){
float *arg3h = (float *)arg3.data;
int nargs = 4;
op_arg args[4] = {arg0,arg1,arg2,arg3};
int ninds = 2;
int inds[4] = {-1,0,1,-1};
if (OP_diags>2) {
printf(" kernel routine with indirection: res \n");
}
// get plan
#ifdef OP_PART_SIZE_0
int part_size = OP_PART_SIZE_0;
#else
int part_size = OP_part_size;
#endif
int set_size = op_mpi_halo_exchanges(set, nargs, args);
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers_core(&cpu_t1, &wall_t1);
if (set->size >0) {
op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
// execute plan
int block_offset = 0;
for (int col=0; col < Plan->ncolors; col++) {
if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
op_x86_res( blockIdx,
(float *)arg1.data,
(float *)arg2.data,
Plan->ind_map,
Plan->loc_map,
(float *)arg0.data,
(float *)arg3.data,
Plan->ind_sizes,
Plan->ind_offs,
block_offset,
Plan->blkmap,
Plan->offset,
Plan->nelems,
Plan->nthrcol,
Plan->thrcol,
set_size);
block_offset += nblocks;
}
op_timing_realloc(0);
OP_kernels[0].transfer += Plan->transfer;
OP_kernels[0].transfer2 += Plan->transfer2;
}
// combine reduction data
op_mpi_set_dirtybit(nargs, args);
// update kernel record
op_timers_core(&cpu_t2, &wall_t2);
op_timing_realloc(0);
OP_kernels[0].name = name;
OP_kernels[0].count += 1;
OP_kernels[0].time += wall_t2 - wall_t1;
}
Author: xyuan, Project: OP2-Common, Lines: 87, Source: res_kernel.cpp
Example 16: op_par_loop_update
void op_par_loop_update(char const *name, op_set set,
op_arg arg0,
op_arg arg1,
op_arg arg2,
op_arg arg3,
op_arg arg4 ){
int ninds = 0;
int nargs = 5;
op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};
double *arg4h = (double *)arg4.data;
if (OP_diags>2) {
printf(" kernel routine w/o indirection: update \n");
}
// initialise timers
double cpu_t1, cpu_t2, wall_t1, wall_t2;
op_timers(&cpu_t1, &wall_t1);
// set number of threads
#ifdef _OPENMP
int nthreads = omp_get_max_threads( );
#else
int nthreads = 1;
#endif
// allocate and initialise arrays for global reduction
double arg4_l[1+64*64];
for (int thr=0; thr<nthreads; thr++)
for (int d=0; d<1; d++) arg4_l[d+thr*64]=ZERO_double;
// execute plan
#pragma omp parallel for
for (int thr=0; thr<nthreads; thr++) {
int start = (set->size* thr )/nthreads;
int finish = (set->size*(thr+1))/nthreads;
op_x86_update( (double *) arg0.data,
(double *) arg1.data,
(double *) arg2.data,
(double *) arg3.data,
arg4_l + thr*64,
start, finish );
}
// combine reduction data
for (int thr=0; thr<nthreads; thr++)
for(int d=0; d<1; d++) arg4h[d] += arg4_l[d+thr*64];
//set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
for(int i = 0; i<nargs; i++)
if(args[i].argtype == OP_ARG_DAT)
set_dirtybit(args[i]);
// perform any global operations
for(int i = 0; i<nargs; i++)
if(args[i].argtype == OP_ARG_GBL)
global_reduce(&args[i]);
// update kernel record
op_timers(&cpu_t2, &wall_t2);
op_timing_realloc(4);
OP_kernels[4].name = name;
OP_kernels[4].count += 1;
OP_kernels[4].time += wall_t2 - wall_t1;
OP_kernels[4].transfer += (double)set->size * arg0.size;
OP_kernels[4].transfer += (double)set->size * arg1.size;
OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
OP_kernels[4].transfer += (double)set->size * arg3.size;
}
Author: doru1004, Project: OP2-Common, Lines: 79, Source: update_mpi_kernel.cpp
Example 17: op_par_loop_adt_calc