C++ cudaMemcpy函数代码示例

OStack程序员社区-中国程序员成长平台 › 门户 › 编程› C++›C++教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了在C/C++程序中调用CUDA运行时函数cudaMemcpy的典型用法代码示例。如果您正苦于以下问题：cudaMemcpy函数的具体用法是什么？cudaMemcpy怎么用？cudaMemcpy有哪些使用示例？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了cudaMemcpy函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: step3_gpu

void step3_gpu(int *n) {

  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  Complexf *data, *data_cpu;
  Complexf *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm);

#ifdef INPLACE
  data_cpu=(Complexf*)malloc(alloc_max);
  cudaMalloc((void**) &data, alloc_max);
#else
  data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data_hat, alloc_max);
#endif

  //accfft_init(nthreads);
  setup_time=-MPI_Wtime();

  /* Create FFT plan */
#ifdef INPLACE
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
  setup_time+=MPI_Wtime();

  /* Warmup Runs */
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif

  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
#ifdef INPLACE
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);
#else
  cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice);
#endif

  MPI_Barrier(c_comm);


  /* Perform forward FFT */
  f_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
  f_time+=MPI_Wtime();

  MPI_Barrier(c_comm);

#ifndef INPLACE
  Complexf *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float));
  data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
#endif

  /* Perform backward FFT */
  i_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
  i_time+=MPI_Wtime();

  /* copy back results on CPU and check error*/
#ifdef INPLACE
  cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
  check_err(data_cpu,n,c_comm);
#else
  cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost);
  check_err(data2_cpu,n,c_comm);
#endif


  /* Compute some timings statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
//.........这里部分代码省略.........

开发者ID:jeffhammond，项目名称:accfft，代码行数:101，代码来源:step3_gpuf.cpp

示例2: copy_device_to_host

/*
 * Copy two device buffers back into their host counterparts with
 * cudaMemcpy(..., cudaMemcpyDeviceToHost): first d_output -> h_output,
 * then d_input -> h_input.
 *
 * size     : passed unchanged as the cudaMemcpy count for BOTH copies, so it
 *            is a size in bytes, not a number of doubles.
 * h_input  : host destination of the second copy.
 * h_output : host destination of the first copy.
 * d_input  : source of the second copy.
 * d_output : source of the first copy.
 *
 * Both calls are wrapped in CHECK_CUDA, which is defined outside this block;
 * presumably it reports or aborts on a non-success status -- TODO confirm.
 */
void copy_device_to_host(const size_t size, double *h_input,double *h_output,double *d_input,double *d_output){

        /* Results first, then the input buffer. */
        CHECK_CUDA(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(h_input, d_input, size, cudaMemcpyDeviceToHost));
}

开发者ID:bearrito，项目名称:learncudathehardway，代码行数:5，代码来源:reduction_helpers.c

示例3: wine_cudaMemcpy

/*
 * WINAPI-convention forwarder for cudaMemcpy.
 *
 * Emits a trace record via WINE_TRACE, then passes dst, src, count and kind
 * to cudaMemcpy unchanged and returns the status code it produced.
 */
cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    cudaError_t status;

    WINE_TRACE("\n");

    status = cudaMemcpy(dst, src, count, kind);
    return status;
}

开发者ID:Shelnutt2，项目名称:cuda-wine-wrapper，代码行数:4，代码来源:cudart.c

示例4: init_arrays

void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F,
		 Command_line_opts *opts, Detector_settings *sett)
{

  // Allocates and initializes to zero the data, detector ephemeris
  // and the F-statistic arrays

//  arr->xDat = (double *) calloc (sett->N, sizeof (double));
	CudaSafeCall( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N));

//  arr->DetSSB = (double *) calloc (3*sett->N, sizeof (double));
	CudaSafeCall( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N));

  CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));
  CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));

  char filename[CHAR_BUFFER_SIZE];
  FILE *data;
  // Input time-domain data handling
  sprintf (filename, "%s/%03d/xdatc_%03d_%03d%s.bin", opts->dtaprefix, opts->ident, \
	   opts->ident, opts->band, opts->label);
  if ((data = fopen (filename, "r")) != NULL) {
    fread ((void *)(arr->xDat), sizeof (double), sett->N, data); // !!! wczytanie danych
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }
  //copy to device
  CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice));


  int Nzeros=0;
  int i;
  // Checking for null values in the data
  for(i=0; i < sett->N; i++)
    if(!arr->xDat[i]) Nzeros++;

  // factor N/(N - Nzeros) to account for null values in the data
  sett->crf0 = (double)sett->N/(sett->N-Nzeros);


  //if white noise...
  if (opts->white_flag)
    sett->sig2 = sett->N*var (arr->xDat, sett->N);
  else
    sett->sig2 = -1.;

  double epsm, phir;

  /*
    ############ Efemerydy ################
  */

  // Ephemeris file handling
  sprintf (filename, "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident);
  if ((data = fopen (filename, "r")) != NULL) {
    // Detector position w.r.t solar system baricenter
    // for every datapoint
    fread ((void *)(arr->DetSSB), sizeof (double), 3*sett->N, data);
    // Deterministic phase defining the position of the Earth
    // in its diurnal motion at t=0
    fread ((void *)(&phir), sizeof (double), 1, data);
    // Earth's axis inclination to the ecliptic at t=0
    fread ((void *)(&epsm), sizeof (double), 1, data);
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }

  //copy DetSSB to device
  CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice));


  /*
    ############ Sincos ################
  */


  sett->sphir = sin (phir);
  sett->cphir = cos (phir);
  sett->sepsm = sin (epsm);
  sett->cepsm = cos (epsm);

  //misc. arrays
  //arr->aa = (double*) malloc(sizeof(double)*sett->N);
  //arr->bb = (double*) malloc(sizeof(double)*sett->N);
  CudaSafeCall( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
  CudaSafeCall( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));
//.........这里部分代码省略.........

开发者ID:mbejger，项目名称:polgraw-allsky，代码行数:101，代码来源:init.c

示例5: main

// Host code
int main(int argc, char** argv)
{
    ParseArguments(argc, argv);
	
	float s_SobelMatrix[25];  
	s_SobelMatrix[0] = 1;
	s_SobelMatrix[1] = 2;
	s_SobelMatrix[2]= 0;
	s_SobelMatrix[3] = -2;
	s_SobelMatrix[4] = -1;
	s_SobelMatrix[5] = 4;
	s_SobelMatrix[6] = 8;
	s_SobelMatrix[7] = 0;
	s_SobelMatrix[8] = -8;
	s_SobelMatrix[9] = -4;
	s_SobelMatrix[10] = 6;
	s_SobelMatrix[11] = 12;
	s_SobelMatrix[12] = 0;
	s_SobelMatrix[13] = -12;
	s_SobelMatrix[14] = -6;
	s_SobelMatrix[15] = 4;
	s_SobelMatrix[16] = 8; 
	s_SobelMatrix[17] = 0;
	s_SobelMatrix[18] = -8;
	s_SobelMatrix[19] =-4;
	s_SobelMatrix[20] =1;
	s_SobelMatrix[21] =2;
	s_SobelMatrix[22] =0;
	s_SobelMatrix[23] =-2;
	s_SobelMatrix[24] =-1;
	
    unsigned char *palete = NULL;
    unsigned char *data = NULL, *out = NULL;
    PPMImage *input_image=NULL, *output_image=NULL;
    output_image = (PPMImage *)malloc(sizeof(PPMImage));
    input_image = readPPM(PPMInFileL);
    printf("Running %s filter\n", Filter);
    out = (unsigned char *)malloc();

    printf("Computing the CPU output\n");
    printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y);
    
	cutilCheckError(cutStartTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER){
	printf("Running Sobel\n");
	CPU_Sobel(intput_image->data, output_image, input_image->x, input_image->y);
	}
	else if(FilterMode == HIGH_BOOST_FILTER){
	printf("Running boost\n");
	CPU_Boost(data, out, dib.width, dib.height);
	}
	cutilCheckError(cutStopTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5)
    BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete);
	
	else if(FilterMode == AVERAGE_FILTER)
	BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete);
	
	else if(FilterMode == HIGH_BOOST_FILTER)
	BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete);
	
    printf("Done with CPU output\n");
	printf("CPU execution time %f \n", cutGetTimerValue(time_CPU));
	
	
    printf("Allocating %d bytes for image \n", dib.image_size);
	
    cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) );
    cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) );
    
	// creating space for filter matrix
	cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) );
	
	cutilCheckError(cutStartTimer(time_mem));
	
	cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice);
	
	cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice);
	
	cutilCheckError(cutStopTimer(time_mem));
    
	FilterWrapper(data, dib.width, dib.height);

    // Copy image back to host
	
	cutilCheckError(cutStartTimer(time_mem));
    cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost);
	cutilCheckError(cutStopTimer(time_mem));
	
	printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem));
    printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem)));
	// Write output image   
    BitMapWrite(BMPOutFile, &bmp, &dib, out, palete);

    Cleanup();
}

开发者ID:rtvj，项目名称:Misc，代码行数:97，代码来源:filter_cpu.c

示例6: toHost

 // Copies n_ * sizeof(T) bytes from the member buffer vals_ into the caller's
 // buffer `base` using cudaMemcpy with cudaMemcpyDeviceToHost.
 //
 // base: destination on the host; it must have room for n_ elements of T.
 //
 // The cudaMemcpy status is handed to cudaCheck, which is defined outside
 // this block (presumably it fails loudly on error -- TODO confirm).
 void toHost(T* base) const {
   cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost));
 }

开发者ID:2php，项目名称:fbcunn，代码行数:3，代码来源:HalfPrecTest.cpp

示例7: preProcess

// Loads the image named by `filename` and prepares every buffer the blur
// pipeline needs. Nothing is returned; results are written through the
// pointer arguments, and the two failure cases handled here (image could not
// be opened, matrices not continuous) print to std::cerr and call exit(1).
//
// Written through the arguments:
//   h_inputImageRGBA / h_outputImageRGBA : row-0 pixel pointers of the
//       imageInputRGBA / imageOutputRGBA matrices (declared outside this
//       function)
//   d_inputImageRGBA  : device buffer holding a copy of the input pixels
//   d_outputImageRGBA : device buffer of the same size, filled with zero bytes
//   d_redBlurred / d_greenBlurred / d_blueBlurred : device buffers of one
//       byte per pixel, filled with zero bytes
//   h_filter    : new[]-allocated 9x9 Gaussian weights (sigma 2), divided by
//                 their sum
//   filterWidth : set to 9
//
// The d_inputImageRGBA__, d_outputImageRGBA__ and h_filter__ variables
// (declared outside this function) are also updated to the new pointers.
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
                uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
                unsigned char **d_redBlurred,
                unsigned char **d_greenBlurred,
                unsigned char **d_blueBlurred,
                float **h_filter, int *filterWidth,
                const std::string &filename) {

  // A harmless runtime call up front, so context problems are reported here.
  checkCudaErrors(cudaFree(0));

  cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
  if (image.empty()) {
    std::cerr << "Couldn't open file: " << filename << std::endl;
    exit(1);
  }

  // Convert to RGBA and create an output matrix of matching dimensions.
  cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
  imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

  // The raw pointers handed out below are only usable as flat arrays when
  // both matrices are stored without row gaps.
  const bool bothContinuous =
      imageInputRGBA.isContinuous() && imageOutputRGBA.isContinuous();
  if (!bothContinuous) {
    std::cerr << "Images aren't continuous!! Exiting." << std::endl;
    exit(1);
  }

  *h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
  *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

  const size_t numPixels = numRows() * numCols();

  // Device-side RGBA buffers: the output is cleared, the input is uploaded.
  checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4)));
  checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

  d_inputImageRGBA__  = *d_inputImageRGBA;
  d_outputImageRGBA__ = *d_outputImageRGBA;

  // ---- Gaussian filter on the host ----------------------------------------
  const int   kernelWidth = 9;
  const float kernelSigma = 2.;
  const int   halfWidth   = kernelWidth / 2;
  const int   tapCount    = kernelWidth * kernelWidth;

  *filterWidth = kernelWidth;

  float *weights = new float[tapCount];
  *h_filter  = weights;
  h_filter__ = weights;

  // Fill row by row; (r, c) are the offsets from the centre tap.
  float weightSum = 0.f;
  for (int row = 0; row < kernelWidth; ++row) {
    const int r = row - halfWidth;
    for (int col = 0; col < kernelWidth; ++col) {
      const int c = col - halfWidth;
      const float weight = expf( -(float)(c * c + r * r) / (2.f * kernelSigma * kernelSigma));
      weights[row * kernelWidth + col] = weight;
      weightSum += weight;
    }
  }

  // Scale every tap by the reciprocal of the accumulated sum.
  const float scale = 1.f / weightSum;
  for (int tap = 0; tap < tapCount; ++tap) {
    weights[tap] *= scale;
  }

  // ---- Per-channel device buffers, one byte per pixel, zero-filled ---------
  checkCudaErrors(cudaMalloc(d_redBlurred,    sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_greenBlurred,  sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_blueBlurred,   sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_redBlurred,   0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_blueBlurred,  0, sizeof(unsigned char) * numPixels));
}

开发者ID:berant89，项目名称:CUDA-Projects，代码行数:85，代码来源:HW2.cpp

示例8: main

int main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", argv[0]);

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float *h_OutputGPU, *d_Output;

    int dim, pos;
    double delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    if (sizeof(INT64) != 8)
    {
        printf("sizeof(INT64) != 8\n");
        return 0;
    }

    cudaDeviceProp deviceProp;
    int dev = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x20)
    {
        fprintf(stderr, "quasirandomGenerator requires Compute Capability of SM 2.0 or higher to run.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    sdkCreateTimer(&hTimer);

    printf("Allocating GPU memory...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));

    printf("Allocating CPU memory...\n");
    h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    printf("Initializing QRNG tables...\n\n");
    initQuasirandomGenerator(tableCPU);

    initTableGPU(tableCPU);

    printf("Testing QRNG...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
    int numIterations = 20;

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        quasirandomGeneratorGPU(d_Output, 0, N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS);

    printf("\nReading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Comparing to the CPU results...\n\n");
    sumDelta = 0;
    sumRef = 0;

    for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
        for (pos = 0; pos < N; pos++)
        {
            ref       = getQuasirandomValue63(pos, dim);
            delta     = (double)h_OutputGPU[dim * N + pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

    printf("\nTesting inverseCNDgpu()...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
//.........这里部分代码省略.........

开发者ID:ziyuhe，项目名称:cuda_project，代码行数:101，代码来源:quasirandomGenerator.cpp

示例9: main

int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int filterWidth;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError = 0.0;
  bool useEpsCheck = false;
  std::string blur_impl = "hw";
  switch (argc) {
    case 2:
      input_file = std::string(argv[1]);
      output_file = "HW2_output.png";
      reference_file = "HW2_reference.png";
      break;
    case 3:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = "HW2_reference.png";
      break;
    case 4:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      break;
    case 5:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      blur_impl = std::string(argv[4]);
      break;
    default:
      std::cerr << "Usage: ./HW2 input_file [output_filename] "
                   "[reference_filename] [blur_impl]]"
                << std::endl;
      exit(1);
  }
  // load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA,
             &d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  // call the students' code
  if (blur_impl == "hw") {
    your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
                       numRows(), numCols(), d_redBlurred, d_greenBlurred,
                       d_blueBlurred, filterWidth);
  } else if (blur_impl == "shared") {
    gaussian_blur_shared(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
                       numRows(), numCols(), d_redBlurred, d_greenBlurred,
                       d_blueBlurred, filterWidth);
  }

  timer.Stop();
  cudaDeviceSynchronize();
  checkCudaErrors(cudaGetLastError());
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    // Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!"
              << std::endl;
    exit(1);
  }

  // check results and output the blurred image

  size_t numPixels = numRows() * numCols();
  // copy the output back to the host
  checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__,
                             sizeof(uchar4) * numPixels,
                             cudaMemcpyDeviceToHost));

  std::cerr << "postProcess output...\n";
  postProcess(output_file, h_outputImageRGBA);

  timer.Start();
  referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(),
                       numCols(), h_filter, filterWidth);
  timer.Stop();
  std::cerr << "referenceCalculation elapsed: " << timer.Elapsed() << " ms\n";

  std::cerr << "postProcess reference...\n";
  postProcess(reference_file, h_outputImageRGBA);

  //  Cheater easy way with OpenCV
  // generateReferenceImage(input_file, reference_file, filterWidth);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError,
                globalError);

//.........这里部分代码省略.........

开发者ID:WangHanbin，项目名称:cs344，代码行数:101，代码来源:main.cpp

示例10: upload

// Uploads a host buffer into this DeviceMemory object.
//
// host_ptr_arg  : host source pointer handed to cudaMemcpy.
// sizeBytes_arg : size forwarded to create() before the copy.
//
// Note that the copy length is the member sizeBytes_, not sizeBytes_arg;
// presumably create() sets sizeBytes_ (and data_) from its argument -- TODO
// confirm against create(). Each CUDA call's status goes through
// cudaSafeCall, which is defined outside this block.
void pcl::gpu::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    // Explicit device-wide synchronization after the copy.
    cudaSafeCall( cudaDeviceSynchronize() );
}

开发者ID:VictorLamoine，项目名称:pcl，代码行数:6，代码来源:device_memory.cpp

示例11: download

// Downloads this object's buffer into host memory.
//
// host_ptr_arg : host destination; sizeBytes_ bytes are copied into it from
//                data_ with cudaMemcpyDeviceToHost, so the caller must
//                provide at least that much space.
//
// Each CUDA call's status goes through cudaSafeCall, which is defined outside
// this block.
void pcl::gpu::DeviceMemory::download(void *host_ptr_arg) const
{    
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    // Explicit device-wide synchronization after the copy.
    cudaSafeCall( cudaDeviceSynchronize() );
}

开发者ID:VictorLamoine，项目名称:pcl，代码行数:5，代码来源:device_memory.cpp

示例12: DeepCopy

// Constructor of the DeepCopy<CudaSpace,HostSpace> specialization: the copy
// happens as a side effect of construction.
//
// dst : destination pointer.
// src : source pointer.
// n   : passed as the cudaMemcpy count, i.e. a size in bytes.
//
// The transfer kind is cudaMemcpyDefault rather than an explicit direction.
// The cudaMemcpy status goes through CUDA_SAFE_CALL, which is defined outside
// this block.
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }

开发者ID:gurkih，项目名称:lammps，代码行数:2，代码来源:Kokkos_CudaSpace.cpp

示例13: main2

int main2(int sockfd)
{
        cufftHandle plan;
        cufftComplex *devPtr;
        cufftReal indata[NX*BATCH];
        cufftComplex data[NX*BATCH];
        int i,timer,j,k;
        char fname[15];
        FILE *f;
	#define BUFSIZE (21*4096*sizeof(int))
	int buffer[BUFSIZE];

        int p,nread;


	f = fopen("21-4096","rb");
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);

        i=0;
        for (j=0;j<BATCH;j++) {
            for (k=0;k<NX;k++) {
                data[j*NX+k].x = buffer[j*NX+k];
                data[j*NX+k].y = 0;
            }
	}


        //f=fopen("y.txt","r");
    /* source data creation */

        //int sockfd = myconnect();
        //printf("connected\n");
	
		

        /* WORKING!!!!!!!!
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"r");
            for (k=0;k<NX;k++) {
                fscanf(f,"%i\n",&p);
                data[j*NX+k].x = p;
                data[j*NX+k].y = 0;
            }
            fclose(f);
	*/
/*
        for(i=  0 ; i < NX*BATCH ; i++){
                //fscanf(f,"%i\n",&p);
                //data[i].x= p;
                data[i].x= 1.0f;
                //printf("%f\n",data[i].x);
                data[i].y = 0.0f;
        }
        //fclose(f)
        */
        //}


        /* creates 1D FFT plan */
        cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);


        /*
        cutCreateTimer(&timer);
        cutResetTimer(timer);
        cutStartTimer(timer);
        */
        
    /* GPU memory allocation */
        cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH);

    /* transfer to GPU memory */
        cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);


        /* executes FFT processes */
        cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);

        /* executes FFT processes (inverse transformation) */
       //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);

    /* transfer results from GPU memory */
        cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);

        /* deletes CUFFT plan */
        cufftDestroy(plan);

    /* frees GPU memory */
        cudaFree(devPtr);

        /*
        cudaThreadSynchronize();
        cutStopTimer(timer);
        printf("%f\n",cutGetTimerValue(timer)/(float)1000);
        cutDeleteTimer(timer);
//.........这里部分代码省略.........

开发者ID:chatbot，项目名称:mybci，代码行数:101，代码来源:source2socket.c

示例14: main


//.........这里部分代码省略.........
  manageCudaError();

	r_lists = (results_list *) malloc(MAX_BUS_GPU * sizeof(results_list));

	for (int i=0; i<MAX_BUS_GPU; i++) {
			new_results_list(&r_lists[i], RESULTS);
	}

	k = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));
	l = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));	

  toc();

  int TAM_BUS_GPU=0, NUM_BLOQUES_GPU=0;

  NUM_REP          = atoi(argv[5]);

  tic("Leer de disco");

  while(nextFASTAToken(queries_file, h_Worig + TAM_BUS_GPU * MAXLINE, h_We + TAM_BUS_GPU * MAXLINE, h_nWe + TAM_BUS_GPU)) {

    TAM_BUS_GPU++;

    if (TAM_BUS_GPU == MAX_BUS_GPU) break;

  }

  toc();

  NUM_BLOQUES_GPU = (TAM_BUS_GPU / TAM_BLOQUE_GPU);

  cudaThreadSynchronize();
  tic("CPU -> GPU");
  cudaMemcpy(d_We, h_We, TAM_BUS_GPU * MAXLINE * sizeof(uint8_t), cudaMemcpyHostToDevice);
  manageCudaError();
  cudaMemcpy(d_nWe,  h_nWe,  TAM_BUS_GPU * sizeof(uint64_t), cudaMemcpyHostToDevice);
  manageCudaError();
  cudaThreadSynchronize();
  toc();

	cudaThreadSynchronize();
  tic("GPU Kernel");
  BWExactSearchBackwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_k, d_l, 0, d_O.siz-2, &d_C, &d_C1, &d_O);
  BWExactSearchForwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_ki, d_li, 0, d_Oi.siz-2, &d_C, &d_C1, &d_Oi);
  cudaThreadSynchronize();
  toc();

  cudaThreadSynchronize();
  tic("GPU -> CPU");
  cudaMemcpy(h_k, d_k, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_l, d_l, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_ki, d_ki, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_li, d_li, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();  
  cudaThreadSynchronize();
  toc();

  tic("CPU Vector");
  for (int i=0; i<TAM_BUS_GPU; i++) {
    BWExactSearchVectorBackward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_O.siz-2, h_k2 + MAXLINE*i, h_l2 + MAXLINE*i, &backward);
	  BWExactSearchVectorForward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_Oi.siz-2, h_ki2 + MAXLINE*i, h_li2 + MAXLINE*i, &forward);

  }

开发者ID:josator，项目名称:gnu-bwt-aligner，代码行数:67，代码来源:optimize_speedup_vector.c

示例15: D_MEMCPY_D2H

//-------------------------------------------------------
// Copy a buffer from device memory to host memory.
//
// param des  : destination pointer (host side of the
//              cudaMemcpyDeviceToHost transfer)
// param src  : source pointer (device side of the transfer)
// param size : passed as the cudaMemcpy count, i.e. bytes
//
// The cudaMemcpy status goes through CUDA_SAFE_CALL, which
// is defined outside this block.
//-------------------------------------------------------
void D_MEMCPY_D2H(void *des, void *src, size_t size)
{
	CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyDeviceToHost));
}

开发者ID:xiaobaidemu，项目名称:GPU_Apriori，代码行数:11，代码来源:BenCUDAMem.cpp

示例16: main

// Black-Scholes benchmark driver.
//
// Prices OPT_N options by launching the BlackScholes kernel through
// HEMI_KERNEL_LAUNCH, prints the first call/put result, and reports the
// elapsed time between StartTimer() and GetTimer().
//
// When HEMI_CUDA_COMPILER is defined, host buffers come from cudaMallocHost,
// separate device buffers come from cudaMalloc, inputs are copied to the
// device before the launch and results are copied back after it. Otherwise
// the host buffers come from malloc and the d_* pointers simply alias them.
//
// argc/argv are unused. Returns 1 if a host malloc fails on the non-CUDA
// path; otherwise falls off the end of main (status 0).
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    printf("Initializing data...\n");
    
    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );
#else
    callResult   = (float*)malloc(OPT_SZ);
    putResult    = (float*)malloc(OPT_SZ);
    stockPrice   = (float*)malloc(OPT_SZ);
    optionStrike = (float*)malloc(OPT_SZ);
    optionYears  = (float*)malloc(OPT_SZ);

    // The buffers are dereferenced below; stop here rather than crash later.
    if (!callResult || !putResult || !stockPrice || !optionStrike || !optionYears) {
        fprintf(stderr, "Host allocation of %d bytes failed\n", OPT_SZ);
        return 1;
    }
#endif

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);
        
    int blockDim = 128; // blockDim, gridDim ignored by host code
    int gridDim  = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim);

    printf("Running %s Version...\n", HEMI_LOC_STRING);

    StartTimer();

#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(d_stockPrice,   stockPrice,   OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears,  optionYears,  OPT_SZ, cudaMemcpyHostToDevice) );
#else
    // No separate device memory on this path: the d_* names alias the host
    // buffers.
    d_callResult   = callResult; 
    d_putResult    = putResult;
    d_stockPrice   = stockPrice; 
    d_optionStrike = optionStrike;
    d_optionYears  = optionYears;
#endif

    HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0,
                       d_callResult, d_putResult, d_stockPrice, d_optionStrike, 
                       d_optionYears, RISKFREE, VOLATILITY, OPT_N);
       
#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult,  d_putResult,  OPT_SZ, cudaMemcpyDeviceToHost) );
#endif

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put is calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

#ifdef HEMI_CUDA_COMPILER 
    // Release all five device buffers; the two result buffers were previously
    // allocated but never freed.
    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );
#else
    free(callResult);
    free(putResult);
    free(stockPrice);
    free(optionStrike);
    free(optionYears);
#endif // HEMI_CUDA_COMPILER
}

开发者ID:Oblynx，项目名称:hemi，代码行数:89，代码来源:blackscholes.cpp

示例17: CUDA

 // Constructs the object from a host array: stores n in n_, allocates
 // n_ * sizeof(T) bytes with cudaMalloc into vals_, then copies that many
 // bytes from `base` into vals_ with cudaMemcpyHostToDevice.
 //
 // base : host source array; must hold at least n elements of T.
 // n    : element count (not bytes).
 //
 // Both CUDA statuses are handed to cudaCheck, which is defined outside this
 // block. Nothing in this block releases vals_; presumably a destructor
 // elsewhere in the class does -- TODO confirm.
 CUDA(const T* base, size_t n) :
 n_(n) {
   cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T)));
   cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice));
 }

开发者ID:2php，项目名称:fbcunn，代码行数:5，代码来源:HalfPrecTest.cpp

示例18: LayerSetUp

/**
 * One-time setup of the multi-stage mean-field layer.
 *
 * Reads the layer parameters (iteration count and the three theta values),
 * caches the shape of bottom[0], shapes top[0] identically, creates the three
 * learnable blobs unless they already exist, and prepares the spatial lattice
 * and its normalization buffer for the active Caffe mode (CPU or GPU).
 *
 * @param bottom  input blobs; only bottom[0] is read here (count/num/channels/
 *                height/width).
 * @param top     output blobs; top[0] is reshaped to
 *                (num_, channels_, height_, width_).
 *
 * Aborts via CHECK if num_iterations_ <= 1, if 'spatial.par' or
 * 'bilateral.par' cannot be opened in the working directory, or if either
 * file does not provide one parseable number per channel.
 */
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  // Neither backend has been prepared yet; the switch below sets one of these.
  init_cpu = false;
  init_gpu = false;
  const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param();

  num_iterations_ = meanfield_param.num_iterations();

  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";

  theta_alpha_ = meanfield_param.theta_alpha();
  theta_beta_ = meanfield_param.theta_beta();
  theta_gamma_ = meanfield_param.theta_gamma();

  // Cache the input geometry; the layer assumes it stays fixed afterwards.
  count_ = bottom[0]->count();
  num_ = bottom[0]->num();
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  num_pixels_ = height_ * width_;

  LOG(INFO) << "This implementation has not been tested batch size > 1.";

  top[0]->Reshape(num_, channels_, height_, width_);

  // Initialize the parameters that will be updated by backpropagation.
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
  } else {

    // blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel
    // weights, blobs_[2] - compatibility matrix.
    this->blobs_.resize(3);

    // Allocate space for kernel weights (channels_ x channels_, zero-filled).
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));

    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());

    // Initialize the kernel weights. The two files spatial.par and
    // bilateral.par should be available. Each supplies one value per channel,
    // which is written onto the diagonal of the corresponding matrix.
    FILE * pFile;
    pFile = fopen("spatial.par", "r");
    CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights.";
    for (int i = 0; i < channels_; i++) {
      // "%lf" requires a double*. The blob storage is Dtype, which is only 4
      // bytes wide in the float instantiation, so parse into a double first
      // and convert instead of scanning straight into the blob.
      double weight = 0.;
      CHECK_EQ(fscanf(pFile, "%lf", &weight), 1)
          << "Could not read spatial kernel weight " << i << " from 'spatial.par'.";
      this->blobs_[0]->mutable_cpu_data()[i * channels_ + i] = static_cast<Dtype>(weight);
    }
    fclose(pFile);

    pFile = fopen("bilateral.par", "r");
    CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";
    for (int i = 0; i < channels_; i++) {
      // Same double -> Dtype conversion as for spatial.par above.
      double weight = 0.;
      CHECK_EQ(fscanf(pFile, "%lf", &weight), 1)
          << "Could not read bilateral kernel weight " << i << " from 'bilateral.par'.";
      this->blobs_[1]->mutable_cpu_data()[i * channels_ + i] = static_cast<Dtype>(weight);
    }
    fclose(pFile);

    // Initialize the compatibility matrix.
    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());

    // Initialize it to have the Potts model: -1 on the diagonal, 0 elsewhere.
    for (int c = 0; c < channels_; ++c) {
      (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);
    }
  }

  // NOTE(review): this is a variable-length array (compiler extension) that
  // lives on the stack and grows with the image size; kept as-is because the
  // remainder of the function is not visible here.
  float spatial_kernel[2 * num_pixels_];
  float *spatial_kernel_gpu_;
  compute_spatial_kernel(spatial_kernel);
  spatial_lattice_.reset(new ModifiedPermutohedral());
  spatial_norm_.Reshape(1, 1, height_, width_);
  Dtype* norm_data_gpu ;
  Dtype*  norm_data;
  // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size.
  switch (Caffe::mode()) {
    case Caffe::CPU:
      norm_data = spatial_norm_.mutable_cpu_data();
      spatial_lattice_->init(spatial_kernel, 2, width_, height_);
      // Calculate spatial filter normalization factors.
      norm_feed_= new Dtype[num_pixels_];
      caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
      // Run the lattice on the all-ones host feed; the output is written into
      // spatial_norm_'s host buffer.
      spatial_lattice_->compute(norm_data, norm_feed_, 1);
      bilateral_kernel_buffer_ = new float[5 * num_pixels_];
      init_cpu = true;
      break;
    #ifndef CPU_ONLY
    case Caffe::GPU:
      // Upload the spatial kernel into a temporary device buffer for init().
      CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ;
      CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ;
      spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
      // Device-side all-ones feed, the GPU counterpart of the CPU branch.
      CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ;
      caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
      norm_data_gpu = spatial_norm_.mutable_gpu_data();
      spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1); 
      // Request the host view of spatial_norm_ after the device computation
      // (presumably consumed by the code that follows - confirm).
      norm_data = spatial_norm_.mutable_cpu_data();
      CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ;
      // The temporary kernel upload is released once the setup calls above are done.
      CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
      init_gpu = true;
      break;
//.........这里部分代码省略.........

开发者ID:AmirooR，项目名称:caffe_video_segmentation，代码行数:101，代码来源:multi_stage_meanfield.cpp

示例19: AllocateResources

void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int64 besttime=std::numeric_limits<int64>::max();

	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			AllocateResources(numb,numt);
			// clear out any existing error
			cudaError_t err=cudaGetLastError();
			err=cudaSuccess;

			int64 st=GetTimeMillis();

			for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
			{
				cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));

				cuda_process_helper(m_devin,m_devout,64,6,numb,numt);

				cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));

				err=cudaGetLastError();
				if(err!=cudaSuccess)
				{
					printf("CUDA error %d\n",err);
				}
			}

			int64 et=GetTimeMillis();

			printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

			if((et-st)<besttime && err==cudaSuccess)
			{
				bestb=numb;
				bestt=numt;
				besttime=et-st;
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	AllocateResources(m_numb,m_numt);

}

开发者ID:chancn，项目名称:bitcoin-pool，代码行数:67，代码来源:bitcoinminercuda.cpp