C++ cudaFree函数代码示例

OStack程序员社区-中国程序员成长平台 › 门户 › 编程 › C++ › C++教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了C++中cudaFree函数的典型用法代码示例。如果您正苦于以下问题：C++ cudaFree函数的具体用法？C++ cudaFree怎么用？C++ cudaFree使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了cudaFree函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: quantus_cuda_cleanup

/* Releases the buffer referenced by comm->matrix through cudaFree.
 * The status returned by cudaFree is discarded and comm->matrix itself is
 * left unchanged, so the stored pointer must not be used after this call. */
void quantus_cuda_cleanup(quantus_comm<T> *comm)
{
    T *matrix = (T *) comm->matrix;
    cudaFree(matrix);
}

开发者ID:thomasluu，项目名称:quantus，代码行数:4，代码来源:quantus_cuda.cpp

示例2: CUDA_CHECK

// Destructor: passes data_ and then diff_ to cudaFree, each call wrapped in
// CUDA_CHECK. When CPU_ONLY is defined the body compiles to nothing.
GPUParams<Dtype>::~GPUParams() {
#if !defined(CPU_ONLY)
  CUDA_CHECK(cudaFree(data_));
  CUDA_CHECK(cudaFree(diff_));
#endif  // !defined(CPU_ONLY)
}

开发者ID:bbshocking，项目名称:caffe，代码行数:6，代码来源:parallel.cpp

示例3: free

	// Releases `data` with cudaFree and forwards the returned status to throw_().
	// A null `data` is ignored without calling into the CUDA runtime.
	static void free(void *data) {
	    if (!data) {
		return;
	    }
	    throw_(cudaFree(data));
	}

开发者ID:asadchev，项目名称:asadchev，代码行数:6，代码来源:allocator.hpp

示例4: cudaFree

// Destructor: frees devicePtr with cudaFree when it is non-null. The status
// returned by cudaFree is not inspected.
TxVectorOptimizationDataCU::~TxVectorOptimizationDataCU() {
  if (!devicePtr) {
    return;
  }
  cudaFree(devicePtr);
}

开发者ID:NobodyInAmerica，项目名称:libTxHPCG，代码行数:5，代码来源:TxVectorOptimizationDataCU.cpp

示例5: cudaFree

// Frees d_resultPoints with cudaFree and then resets the member to NULL, so a
// later call passes NULL instead of an already-freed pointer.
// The status returned by cudaFree is not inspected.
void CloudConstructor::freeGPUPoints() {
	cudaFree(d_resultPoints);
	d_resultPoints = NULL;
}

开发者ID:damonseeley，项目名称:electroland_repos，代码行数:4，代码来源:CloudConstructor.cpp

示例6: main

int
main()
{
	int i;
	struct timeval start, stop;
	FILE *fd;
	char *key;

	cudaSetDevice(0);

	/* Allocate memory */
	if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
		printf("Malloc failed!\n");
		exit(EXIT_FAILURE);
	}

	cudaMallocHost((void **) &batchKeys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
	cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
	cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
	cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));

	cudaMalloc((void **) &d_keys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
        cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
        cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
        cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));

	/* Create 'BATCH_SIZE' number of random keys 
	 * and add them to batch table
	 */
	batchNo = 0;
        batchIndex[0] = 0;
	for(i = 0; i < BATCH_SIZE; i++) { 
		gen_random(key, 30);
		add_to_batch(key, 30);
	}

	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM
	
	/* MemCpy Host -> Device */
	cudaMemcpy(d_keys, batchKeys, (batchIndex[BATCH_SIZE-1] +
	    strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
	    cudaMemcpyHostToDevice);
        cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t),
	    cudaMemcpyHostToDevice);
        cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int),
	    cudaMemcpyHostToDevice);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM

	/* Call the kernel */
	CUDAhash(d_keys, d_index, d_len, d_res);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	cudaDeviceSynchronize();
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	/* MemCpy Device -> Host */
	cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t),
	    cudaMemcpyDeviceToHost);	
	
	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	
#ifdef DEBUG
	for(i = 0; i < BATCH_SIZE; i++) {
		printf("%s\n", &batchKeys[batchIndex[i]]);
		printf("%u\n", hashedKeys[i]);
	}
#endif // DEBUG

	/* Print Time */
	fd = fopen("log.txt", "a+");
	fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
	fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE / 
	    ((double)(((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec)) / 1000000 )) / 1000);
	fclose(fd);

#ifdef DEBUG
	printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG
	
        /* Free memory */
        cudaFree(batchKeys);
	cudaFree(nKeys);
//.........这里部分代码省略.........

开发者ID:deyannis，项目名称:HY527，代码行数:101，代码来源:hash.c

示例7: CUDA_SAFE_CALL

// Releases arg_alloc_ptr with cudaFree; the size argument is unnamed and unused.
// Anything thrown inside the try block (CUDA_SAFE_CALL is the only call there)
// is caught and discarded, so this function never propagates an exception and
// a failed free goes unreported.
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
  try {
    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
  } catch(...) {}
}

开发者ID:gmackey，项目名称:kokkos，代码行数:6，代码来源:Kokkos_CudaSpace.cpp

示例8: calculateOnGPU


//.........这里部分代码省略.........
    //move constants variables to constant cuda memory
    setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength,
                queryLength, gapOpen, gapExtension, maxScore, partQuerySize,
                U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG,
                U2::SmithWatermanAlgorithm::STOP);

    size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
    u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
    // start main loop
    for (int i = 0; i < queryDevider; i++) {

        calculateMatrix_wrap( dimBlock.x, dimGrid.x, g_seqLib,
            g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax,
            g_FdataUp, g_directionsUp, g_directionsRec,
            g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);

        cudaError hasErrors = cudaThreadSynchronize();

        if (hasErrors != 0) {
            u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
        }

        //revert arrays
        g_HdataTmp = g_HdataRec;
        g_HdataRec = g_HdataUp;
        g_HdataUp = g_HdataTmp;

        g_HdataTmp = g_directionsRec;
        g_directionsRec = g_directionsUp;
        g_directionsUp = g_HdataTmp;
    }

    //Copy vectors on host and find actual results
    cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
    cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
    }

    QList<resType> pas;
    resType res;
    for (int j = 0; j < (sizeRow); j++) {
        if (tempRow[j] >= maxScore) {
            res.refSubseq.startPos = directionRow[j];
            res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
            res.score = tempRow[j];
            if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
                qint32 pairAlignOffset = 0;

                qint32 row = backtraceBegins[2 * j];
                qint32 column = backtraceBegins[2 * j + 1];
                while(U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
                    if(U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
                        row--;
                        column--;
                    } else if(U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
                        column--;
                    } else if(U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
                        row--;
                    }
                    if(0 >= row || 0 >= column) {
                        break;
                    }
                }
                res.patternSubseq.startPos = row;
                res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
            }

            pas.append(res);
        }
    }

    //deallocation memory
    cudaFree(g_seqLib);
    cudaFree(g_queryProfile);
    cudaFree(g_HdataMax);
    cudaFree(g_HdataUp);
    cudaFree(g_HdataRec);
    cudaFree(g_FdataUp);
    cudaFree(g_directionsUp);
    cudaFree(g_directionsMax);
    cudaFree(g_directionsRec);

    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaFree(g_directionsMatrix);
        cudaFree(g_backtraceBegins);
    }

    delete[] tempRow;
    delete[] directionRow;
    delete[] zerroArr;
    delete[] globalMatrix;
    delete[] backtraceBegins;

    return pas;
}

开发者ID:ugeneunipro，项目名称:ugene，代码行数:101，代码来源:sw_cuda_cpp.cpp

示例9:

 // Destructor: when _state is non-null, frees it with cudaFree and hands the
 // returned status to CUDA_CHECK.
 ~curandStateManager()
 {
     if(_state != NULL)
     {
         CUDA_CHECK(cudaFree(_state));
     }
 }

开发者ID:hxiaox，项目名称:arrayfire，代码行数:5，代码来源:random.hpp

示例10: sci_gpuLU


//.........这里部分代码省略.........
                default : throw "First option argument must be 0 or 1 or 2.";
            }

            switch((int)option[1])
            {
                case 0 :    // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 :    // Keep data of the fisrt argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
						dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;

                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";

                    posOutput++;
                    break;
                }

                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
        #endif

        #ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
        #endif
        if(Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if(posOutput < Lhs+1)
            throw "Too many output arguments.";

        if(posOutput > Lhs+1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
    #ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
    }
    #endif
    #ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
    #endif
    if(Rhs == 1 && option != NULL) free(option);
    return EXIT_FAILURE;
}

开发者ID:dawuweijun，项目名称:scigpgpu，代码行数:101，代码来源:sci_gpuLU.cpp

示例11: main


//.........这里部分代码省略.........
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    if (error_norm / ref_norm < 1e-6f)
    {
        printf("simpleCUBLAS test passed.\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("simpleCUBLAS test failed.\n");
        exit(EXIT_FAILURE);
    }
}

开发者ID:intersense，项目名称:ox-cuda，代码行数:101，代码来源:simpleCUBLAS.cpp

示例12: main


//.........这里部分代码省略.........
			
			int sem_status = sem_wait(sem1);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}			

			sem_status = sem_post(sem2);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}
		}

		// At this point two processes are synchronized.

		config.step++;
		
		// Reassign porcesses' input data segments to show some
		// possible manipulation on shared memory.
		// Here we perform cyclic shift of data pointers.
		config.idevice++;
		config.idevice %= ndevices + 1;
		config.inout_cpu = inout +  config.idevice * np;
	}

	// Release device buffers.
	if (worker)
	{
		cuda_status = cudaFree(config.in_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
		cuda_status = cudaFree(config.out_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
	}
	else
	{
		free(config.in_dev);
		free(config.out_dev);
	}
	
	printf("Device %d deinitialized py process %d\n", config.idevice, pid);

	// On master process perform results check:
	// compare each GPU result to CPU result.
	if (master)
	{
		float* control = inout + np * ndevices;
		for (int idevice = 0; idevice < ndevices; idevice++)
		{
			// Find the maximum abs difference.
			int maxi = 0, maxj = 0;
			float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);

开发者ID:7633，项目名称:msu-cuda-course，代码行数:67，代码来源:shmem_mmap_cuda.c

示例13:

// Destructor: frees _devicePtr with cudaFree when it is non-null. The status
// returned by cudaFree is not inspected.
OsdCudaTable::~OsdCudaTable() {
    if (_devicePtr) {
        cudaFree(_devicePtr);
    }
}

开发者ID:Len3d，项目名称:OpenSubdiv，代码行数:4，代码来源:cudaComputeContext.cpp

示例14: main

int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;
    char inBuf[200]; // ridiculously large input buffer.
    
    bool isFirst = true;

  do {

    // Allocate memory for the arrays
    int *h_indices = 0;
    double        *h_outputGPU  = 0;

    try
    {
        h_indices = new int [n_indices * n_dimensions];
        h_outputGPU  = new double [n_indices * n_dimensions];
    }
    catch (std::exception e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    int *d_indices;
    double        *d_output;

    try
    {
        cudaError_t cudaResult;
        cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));

        if (cudaResult != cudaSuccess)
        {
            throw std::runtime_error(cudaGetErrorString(cudaResult));
        }
    }
    catch (std::runtime_error e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    // Initialize the indices (done on the host)
    for(int i = 0; i < n_indices; i++) {
      h_indices[i] = i;
    }

    // Copy the indices to the device
    cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    // Execute the QRNG on the device
    int n_vec;
    sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);

    cudaDeviceSynchronize();

    cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

    // Cleanup and terminate
    delete h_indices;
    cudaFree(d_indices);
    cudaFree(d_output);

    if(!isFirst) {
      printf("RESULT ");

      for(int i = 0; i < std::min(n_indices,10); i++)
        printf("%f ", h_outputGPU[i]);

      printf("\n");
    }
    else {
      printf("OK\n");
      isFirst = false;
    }

    delete h_outputGPU;

      fgets(inBuf, 200, stdin);

      if (sscanf(inBuf, "%u", &n_indices) == 0)
      {
        // if input is not a number, it has to be "EXIT"
        if (strncmp("EXIT",inBuf,4)==0)
        {
          printf("OK\n");
          break;
        }
        else
        {
          printf("ERROR. Bad input: %s\n", inBuf);
//.........这里部分代码省略.........

开发者ID:HIPERFIT，项目名称:vectorprogramming，代码行数:101，代码来源:sobol.cpp

示例15: gpuErrchk

// Destructor: frees d_V with cudaFree and passes the returned status to
// gpuErrchk.
PhysicsProcessor::~PhysicsProcessor(void)
{
	gpuErrchk(cudaFree(d_V));
}

开发者ID:Aloalo，项目名称:RTRT，代码行数:4，代码来源:PhysicsProcessor.cpp

示例16: exp2

// Applies the pending cache geometry (_nextnLevels, _nextLevelCube,
// _nextOffset) and rebuilds the device memory pool accordingly.
//
// Steps, in order:
//   1. Promote the pending values and clear _nextnLevels / _nextLevelCube.
//   2. Recompute the cube dimension, element count per cube, the min/max index
//      range and _maxC (CPU-cache extent, padded up by one cube edge per axis
//      when it is not an exact multiple of the edge).
//   3. Select _device, free the previous pool (if any), query free device
//      memory and allocate a new pool sized to 80% of it.
//   4. Reset _freeSlots and chain to ControlElementCache::_reSizeCache().
//
// Every CUDA failure is logged to std::cerr and followed by a bare `throw;`.
// NOTE(review): `throw;` with no exception currently being handled calls
// std::terminate; it only rethrows when this function is reached from inside
// a catch handler. Left unchanged so the failure behaviour stays as it was.
void ControlCubeCache::_reSizeCache()
{
	_nLevels = _nextnLevels;
	_levelCube = _nextLevelCube;
	_offset	= _nextOffset;
	_nextnLevels = 0;
	_nextLevelCube = 0;

	// Cube edge length in voxels plus a CUBE_INC border on each side.
	_dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;

	// Number of elements in one cube.
	_sizeElement = pow(_dimCube, 3);

	int dimV = exp2(_nLevels);
	_minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
	_maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);

	// Extent covered by the CPU cache; an axis that is not an exact multiple
	// of the cube edge is extended by one more edge.
	int dc = exp2(_nLevels - _levelCube);
	vmml::vector<3,int> mn = _cpuCache->getMinCoord();
	vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
	_maxC = mx - mn;
	if ((mx.x() - mn.x()) % dc != 0)
		_maxC[0] += dc;
	if ((mx.y() - mn.y()) % dc != 0)
		_maxC[1] += dc;
	if ((mx.z() - mn.z()) % dc != 0)
		_maxC[2] += dc;

	if (cudaSuccess != cudaSetDevice(_device))
	{
		std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	if (_memory != 0)
	{
		if (cudaSuccess != cudaFree((void*)_memory))
		{
			std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
			throw;
		}
		// The old pool is gone. Clear the member right away so that any
		// failure below cannot leave a stale pointer behind for the next call
		// to pass to cudaFree a second time.
		_memory = 0;
	}

	// Named *Bytes rather than `free` to avoid shadowing ::free.
	size_t totalBytes = 0;
	size_t freeBytes = 0;

	if (cudaSuccess != cudaMemGetInfo(&freeBytes, &totalBytes))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	float memorySize = (0.80f*freeBytes); // Get 80% of free memory

	_maxNumCubes = memorySize/ (_sizeElement*sizeof(float));
	if (_maxNumCubes == 0)
	{
		std::cerr<<"Control Cube Cache: Memory aviable is not enough "<<memorySize/1024/1024<<" MB"<<std::endl;
		throw;
	}

	if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float)))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	_freeSlots = _maxNumCubes;

	ControlElementCache::_reSizeCache();
}

开发者ID:carlosduelo，项目名称:eqMivtRefactor，代码行数:66，代码来源:controlCubeCache.cpp

示例17: mpla_redistribute_vector_for_dgesv

// Redistributes vector `b` into `b_redist`.
//
// Within each process column (a sub-communicator of instance->comm that keeps
// dimension 0), every process contributes its local slice of `b` to an
// MPI_Allgatherv, producing the full vector in a temporary cudaMalloc buffer.
// Each process then copies the range
//   [b_redist->cur_proc_row_offset, + b_redist->cur_proc_row_count)
// of that buffer into b_redist->data.
//
// Parameters:
//   b_redist  destination; its cur_proc_row_offset, cur_proc_row_count and
//             data members must already be set up (the setup code below is
//             commented out, so this function does not do it).
//   b         source vector.
//   A         not referenced by the active code (only by the disabled setup).
//   instance  supplies the communicator and process-grid information.
//
// NOTE(review): full_vector is memory from cudaMalloc and is handed directly to
// MPI_Allgatherv; this presumably relies on a CUDA-aware MPI build — confirm.
void mpla_redistribute_vector_for_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_matrix* A, struct mpla_instance* instance)
{
	// attention: this code does no correctness check for the input data



//	b_redist->vec_row_count = b->vec_row_count;
//
//	// allocating memory for process-wise vector information
//	vector->proc_row_count = new int*[instance->proc_rows];
//	vector->proc_row_offset = new int*[instance->proc_rows];
//	for (int i=0; i<instance->proc_rows; i++)
//	{
//		b_redist->proc_row_count[i] = new int[instance->proc_cols];
//		b_redist->proc_row_offset[i] = new int[instance->proc_cols];
//	}
//
//	// set sizes of 
//	for (int i=0; i<instance->proc_rows; i++)
//	{
//		for (int j=0; j<instance->proc_cols; j++)
//		{
//			b_redist->proc_row_count[i][j] = A->proc_col_count[i][j];
//			b_redist->proc_row_offset[i][j] = A->proc_col_offset[i][j];
//		}
//	}
//
//	// retrieving local data for current process
//	b_redist->cur_proc_row_count = A->cur_proc_col_count;
//	b_redist->cur_proc_row_offset = A->cur_proc_col_offset;
//
//	// allocating temporary vector storage
//	cudaMalloc((void*)&(b_redist->data), sizeof(double)*b_redist->cur_proc_row_count);

	// WARNING: The following code is not efficient for a strong parallelization !!!!!


	// create sub-communicator for each process column
	int remain_dims[2];
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Comm column_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
	int column_rank;
	MPI_Comm_rank(column_comm, &column_rank);
	
	// columnwise creation of the full vector
	double* full_vector;
	int* recvcounts = new int[instance->proc_rows];
	int* displs = new int[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
		displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
	}
	cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// The count/displacement tables are only needed by the blocking gather
	// above; release them here so they are not leaked on every call.
	delete[] recvcounts;
	delete[] displs;

	// extract column-wise local part of full vector
	cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// memory cleanup
	cudaFree(full_vector);

	MPI_Comm_free(&column_comm);
}

开发者ID:zaspel，项目名称:MPLA，代码行数:68，代码来源:mpla.cpp

示例18: cudaFree

// Destructor: releases `data` with delete[]; when GPU_ENABLED is defined it
// additionally passes dataGpu to cudaFree (status not inspected).
RealKernel::~RealKernel()
{
	delete[] data;
#if defined(GPU_ENABLED)
	cudaFree(dataGpu);
#endif
}

开发者ID:yalcinozhabes，项目名称:pythonJDFTx，代码行数:6，代码来源:ScalarField.cpp

示例19: run_2D_GLOBAL_MEMORY

void run_2D_GLOBAL_MEMORY()
{
	int arrayWidth	= 4;
	int arrayHeight	= 4;

	bool SEQ = true; 

	/* Host allocation */
	float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* outArr_H	= (float*) malloc(arrayWidth * arrayHeight * sizeof(float));

	/* Fill arrays */
	int index = 0; 
	if (SEQ)
	{
		int ctr = 0; 
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] = (float) ctr++;  
				inArr_2_H[index] = (float) ctr++; 
				outArr_H[index] = (float) 0;
			}
		}
	}
	else 
	{
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] =  (float)rand()/(float)RAND_MAX;
				inArr_2_H[index] = (float)rand()/(float)RAND_MAX;
				outArr_H[index] = 0; 
			}
		}
	}
	    
	/* Print host arrays */ 
	printf("inArr_1_H \n");
	print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
	printf("inArr_2_H \n");
	print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);

	/* Device allocation + <__pitch> */
	float *inArr_1_D, *inArr_2_D, *outArr_D;
	size_t __pitch;
	cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&outArr_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);

	/* Print __pitch */
	printf("__pitch %d \n", (__pitch/sizeof(float))); 

	/* Uploading data */ 
	cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
	cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
 
	/* Gridding */
	dim3 __numBlocks(1,1,1);
	dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
	__numBlocks.x = ((arrayWidth / BLOCK_SIZE) + (((arrayWidth) % BLOCK_SIZE) == 0 ? 0:1));
	__numBlocks.y = ((arrayHeight / BLOCK_SIZE) + (((arrayHeight) % BLOCK_SIZE) == 0 ? 0:1));

	/* Kernel invokation */
	add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch, __numBlocks, __numThreadsPerBlock); 

	/* Synchronization */
	cudaThreadSynchronize();

	/* Download result */
	cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch, arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);

	/* Free device arrays */
	cudaFree(inArr_1_D);
	cudaFree(inArr_2_D);
	cudaFree(outArr_D);

	/* Display results */
	printf("outArr \n");
	print_2D_Array(outArr_H, arrayWidth, arrayHeight);
}

开发者ID:wow2006，项目名称:cuYURI，代码行数:88，代码来源:globalMem_2D.cpp

示例20: set_size

    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }

开发者ID:davisking，项目名称:dlib，代码行数:82，代码来源:gpu_data.cpp

注：本文中的cudaFree函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。