深度学习之Caffe完全掌握:配置GPU驱动安装cuda
安装Nvidia驱动
其他博客一般是要求安装你的电脑对应的版本,我直接装了nvidia-367(为了配合cuda8.0),也可以用。
你也可以参照:http://blog.csdn.net/xuzhongxiong/article/details/52717285
root@master# sudo add-apt-repository ppa:xorg-edgers/ppa
root@master# sudo apt-get update
root@master# sudo apt-get install nvidia-367
root@master# sudo apt-get install mesa-common-dev
root@master# sudo apt-get install freeglut3-dev
root@master# nvidia-smi
Sun Feb 11 11:18:43 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111 Driver Version: 384.111 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce 920M Off | 00000000:01:00.0 N/A | N/A |
| N/A 41C P5 N/A / N/A | 129MiB / 2004MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 Not Supported |
+-----------------------------------------------------------------------------+
安装就成功了
安装cuda8.0
https://developer.nvidia.com/cuda-toolkit
一定要选择8.0,一共1.4G左右。
会有非常长的接受许可信息,一直ENTER直到输入accept。驱动之前已经安装,这里就不要选择安装驱动。其余的都直接默认或者选择是即可。
使用:
root@master# sudo sh cuda_8.0.27_linux.run
root@master# vim /etc/profile
export PATH=/usr/local/cuda-8.0/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64:$LD_LIBRARY_PATH
root@master# source /etc/profile
root@master#
root@master#
测试cuda的Samples
root@master# cd /usr/local/cuda-8.0/samples/1_Utilities/deviceQuery
root@master# sudo make
root@master# ./deviceQuery
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce 920M"
CUDA Driver Version / Runtime Version 9.0 / 8.0
CUDA Capability Major/Minor version number: 3.5
Total amount of global memory: 2004 MBytes (2101542912 bytes)
( 2) Multiprocessors, (192) CUDA Cores/MP: 384 CUDA Cores
GPU Max Clock rate: 954 MHz (0.95 GHz)
Memory Clock rate: 900 Mhz
Memory Bus Width: 64-bit
L2 Cache Size: 524288 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.0, CUDA Runtime Version = 8.0, NumDevs = 1, Device0 = GeForce 920M
Result = PASS
sample下的蒙特卡罗模拟程序
cuda的编译语句
root@master# "/usr/local/cuda-8.0"/bin/nvcc -ccbin g++ -I../../common/inc -m64 \
    -gencode arch=compute_20,code=sm_20 \
    -gencode arch=compute_30,code=sm_30 \
    -gencode arch=compute_35,code=sm_35 \
    -gencode arch=compute_37,code=sm_37 \
    -gencode arch=compute_50,code=sm_50 \
    -gencode arch=compute_52,code=sm_52 \
    -gencode arch=compute_60,code=sm_60 \
    -gencode arch=compute_60,code=compute_60 \
    -o MonteCarloMultiGPU.o -c MonteCarloMultiGPU.cpp
程序
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
// includes, project
#include <helper_functions.h> // Helper functions (utilities, parsing, timing)
#include <helper_cuda.h> // helper functions (cuda error checking and initialization)
#include <multithreading.h>
#include "MonteCarlo_common.h"
// Global copies of main()'s argc/argv so helper code can inspect the
// command line; set at the top of main().
int *pArgc = NULL;
char **pArgv = NULL;
#ifdef WIN32
// MSVC has no POSIX strcasecmp; map it to the equivalent _strcmpi.
#define strcasecmp _strcmpi
#endif
////////////////////////////////////////////////////////////////////////////////
// Common functions
////////////////////////////////////////////////////////////////////////////////
// Return a pseudo-random float uniformly drawn from [low, high],
// using the C library rand() generator (seeded by the caller via srand()).
float randFloat(float low, float high)
{
    // Uniform sample in [0, 1], then linearly interpolate between the bounds.
    float u = (float)rand() / (float)RAND_MAX;
    return (1.0f - u) * low + u * high;
}
/// Utility function to tweak problem size for small GPUs
int adjustProblemSize(int GPU_N, int default_nOptions)
{
int nOptions = default_nOptions;
// select problem size
for (int i=0; i<GPU_N; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
* deviceProp.multiProcessorCount;
if (cudaCores <= 32)
{
nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2);
}
}
return nOptions;
}
int adjustGridSize(int GPUIndex, int defaultGridSize)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex));
int maxGridSize = deviceProp.multiProcessorCount * 40;
return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize);
}
///////////////////////////////////////////////////////////////////////////////
// CPU reference functions
///////////////////////////////////////////////////////////////////////////////
// CPU Monte Carlo pricer; declared extern "C" — the implementation is not in
// this file. Used only when DO_CPU is enabled in main().
extern "C" void MonteCarloCPU(
    TOptionValue &callValue,
    TOptionData optionData,
    float *h_Random,
    int pathN
);

// Black-Scholes formula for call options: closed-form reference value used to
// validate the Monte Carlo results (implementation not in this file).
extern "C" void BlackScholesCall(
    float &CallResult,
    TOptionData optionData
);

////////////////////////////////////////////////////////////////////////////////
// GPU-driving host thread
////////////////////////////////////////////////////////////////////////////////
// One stopwatch per GPU, allocated/created in main(); indexed by device ordinal.
// Timer 0 doubles as the whole-run timer in the streamed (multiSolver) path.
StopWatchInterface **hTimer = NULL;
// Per-GPU worker routine for the "threaded" mode: one host thread runs this
// for one TOptionPlan (see cutStartThread in main). Binds the thread to the
// plan's device, runs the full Monte Carlo pipeline, and times it with the
// per-device stopwatch hTimer[plan->device].
static CUT_THREADPROC solverThread(TOptionPlan *plan)
{
    // Init GPU: bind this host thread to the plan's device.
    checkCudaErrors(cudaSetDevice(plan->device));

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));

    // Start the per-device timer.
    sdkStartTimer(&hTimer[plan->device]);

    // Allocate intermediate memory for MC integrator and initialize
    // RNG states.
    initMonteCarloGPU(plan);

    // Main computation; synchronize so the timer measures completed GPU work.
    MonteCarloGPU(plan);
    checkCudaErrors(cudaDeviceSynchronize());

    // Stop the timer.
    sdkStopTimer(&hTimer[plan->device]);

    // Shut down this GPU.
    closeMonteCarloGPU(plan);

    // Drain the default stream before reporting completion.
    cudaStreamSynchronize(0);

    printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name);

    CUT_THREADEND;
}
// Single-threaded multi-GPU driver ("streamed" mode): one host thread drives
// all nPlans GPUs concurrently using one CUDA stream and one event per plan.
// Timing uses hTimer[0] for the whole computation. Fixes a leak present in
// the original: the malloc'd stream/event handle arrays are now freed.
static void multiSolver(TOptionPlan *plan, int nPlans)
{
    // Allocate and initialize an array of stream/event handles, one per plan.
    cudaStream_t *streams = (cudaStream_t *) malloc(nPlans * sizeof(cudaStream_t));
    cudaEvent_t *events = (cudaEvent_t *) malloc(nPlans * sizeof(cudaEvent_t));

    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
        checkCudaErrors(cudaEventCreate(&(events[i])));
    }

    // Init each GPU.
    // Since CUDA 4.0 a single host thread may call cudaSetDevice repeatedly
    // to target each device in turn; initialize each one that way.
    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));

        cudaDeviceProp deviceProp;
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device));

        // Allocate intermediate memory for MC integrator
        // and initialize RNG state.
        initMonteCarloGPU(&plan[i]);
    }

    // Make sure all initialization work has finished before timing starts.
    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    // Start the timer (hTimer[0] covers the whole multi-GPU computation).
    sdkResetTimer(&hTimer[0]);
    sdkStartTimer(&hTimer[0]);

    // Launch the main computation asynchronously on each device's stream and
    // record an event so per-device completion can be awaited.
    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        MonteCarloGPU(&plan[i], streams[i]);
        checkCudaErrors(cudaEventRecord(events[i], streams[i]));
    }

    // Wait for every device to finish its work.
    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        cudaEventSynchronize(events[i]);
    }

    // Stop the timer.
    sdkStopTimer(&hTimer[0]);

    // Release per-GPU resources.
    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        closeMonteCarloGPU(&plan[i]);
        checkCudaErrors(cudaStreamDestroy(streams[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    // BUGFIX: the handle arrays were malloc'd above but never freed.
    free(streams);
    free(events);
}
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
// Compile-time switches: DO_CPU also runs the CPU Monte Carlo reference,
// PRINT_RESULTS dumps per-option values. Each is defined and then immediately
// #undef'd, i.e. both features are disabled by default — remove the matching
// #undef to enable one.
#define DO_CPU
#undef DO_CPU

#define PRINT_RESULTS
#undef PRINT_RESULTS
// Print the command-line options understood by this sample to stdout.
void usage()
{
    // Help text kept as a table; each entry is emitted verbatim.
    static const char *const helpText[] =
    {
        "--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n",
        "Method=threaded: 1 CPU thread for each GPU [default]\n",
        " streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or newer)\n",
        "Scaling=strong : constant problem size\n",
        " weak : problem size scales with number of available GPUs [default]\n"
    };

    for (size_t i = 0; i < sizeof(helpText) / sizeof(helpText[0]); i++)
    {
        printf("%s", helpText[i]);
    }
}
int main(int argc, char **argv)
{
char *multiMethodChoice = NULL;
char *scalingChoice = NULL;
bool use_threads = true;
bool bqatest = false;
bool strongScaling = false;
pArgc = &argc;
pArgv = argv;
printf("%s Starting...\n\n", argv[0]);
if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
{
bqatest = true;
}
getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);
if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
checkCmdLineFlag(argc, (const char **)argv, "help"))
{
usage();
exit(EXIT_SUCCESS);
}
if (multiMethodChoice == NULL)
{
use_threads = false;
}
else
{
if (!strcasecmp(multiMethodChoice, "threaded"))
{
use_threads = true;
}
else
{
use_threads = false;
}
}
if (use_threads == false)
{
printf("Using single CPU thread for multiple GPUs\n");
}
if (scalingChoice == NULL)
{
strongScaling = false;
}
else
{
if (!strcasecmp(scalingChoice, "strong"))
{
strongScaling = true;
}
else
{
strongScaling = false;
}
}
//GPU number present in the system
int GPU_N;
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
int nOptions = 8 * 1024;
nOptions = adjustProblemSize(GPU_N, nOptions);
// select problem size
int scale = (strongScaling) ? 1 : GPU_N;
int OPT_N = nOptions * scale;
int PATH_N = 262144;
// initialize the timers
hTimer = new StopWatchInterface*[GPU_N];
for (int i=0; i<GPU_N; i++)
{
sdkCreateTimer(&hTimer[i]);
sdkResetTimer(&hTimer[i]);
}
//Input data array
TOptionData *optionData = new TOptionData[OPT_N];
//Final GPU MC results
TOptionValue *callValueGPU = new TOptionValue[OPT_N];
//"Theoretical" call values by Black-Scholes formula
float *callValueBS = new float[OPT_N];
//Solver config
TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
//OS thread ID
CUTThread *threadID = new CUTThread[GPU_N];
int gpuBase, gpuIndex;
int i;
float time;
double delta, ref, sumDelta, sumRef, sumReserve;
printf("MonteCarloMultiGPU\n");
printf("==================\n");
printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed");
printf("Problem scaling = %s\n", strongScaling? "strong" : "weak");
printf("Number of GPUs = %d\n", GPU_N);
printf("Total number of options = %d\n", OPT_N);
printf("Number of paths = %d\n", PATH_N);
printf("main(): generating input data...\n");
srand(123);
for (i=0; i < OPT_N; i++)
{
optionData[i].S = randFloat(5.0f, 50.0f);
optionData[i].X = randFloat(10.0f, 25.0f);
optionData[i].T = randFloat(1.0f, 5.0f);
optionData[i].R = 0.06f;
optionData[i].V = 0.10f;
callValueGPU[i].Expected = -1.0f;
callValueGPU[i].Confidence = -1.0f;
}
printf("main(): starting %i host threads...\n", GPU_N);
//Get option count for each GPU
for (i = 0; i < GPU_N; i++)
{
optionSolver[i].optionCount = OPT_N / GPU_N;
}
//Take into account cases with "odd" option counts
for (i = 0; i < (OPT_N % GPU_N); i++)
{
optionSolver[i].optionCount++;
}
//Assign GPU option ranges
gpuBase = 0;
for (i = 0; i < GPU_N; i++)
{
optionSolver[i].device = i;
optionSolver[i].optionData = optionData + gpuBase;
optionSolver[i].callValue = callValueGPU + gpuBase;
optionSolver[i].pathN = PATH_N;
optionSolver[i].gridSize = adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount);
gpuBase += optionSolver[i].optionCount;
}
if (use_threads || bqatest)
{
//Start CPU thread for each GPU
for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
{
threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
}
printf("main(): waiting for GPU results...\n");
cutWaitForThreads(threadID, GPU_N);
printf("main(): GPU statistics, threaded\n");
for (i = 0; i < GPU_N; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
printf("Options : %i\n", optionSolver[i].optionCount);
printf("Simulation paths: %i\n", optionSolver[i].pathN);
time = sdkGetTimerValue(&hTimer[i]);
printf("Total time (ms.): %f\n", time);
printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
}
printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
sumDelta = 0;
sumRef = 0;
sumReserve = 0;
for (i = 0; i < OPT_N; i++)
{
BlackScholesCall(callValueBS[i], optionData[i]);
delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
ref = callValueBS[i];
sumDelta += delta;
sumRef += fabs(ref);
if (delta > 1e-6)
{
sumReserve += callValueGPU[i].Confidence / delta;
}
#ifdef PRINT_RESULTS
printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
}
sumReserve /= OPT_N;
}
if (!use_threads || bqatest)
{
multiSolver(optionSolver, GPU_N);
printf("main(): GPU statistics, streamed\n");
for (i = 0; i < GPU_N; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
printf("Options : %i\n", optionSolver[i].optionCount);
printf("Simulation paths: %i\n", optionSolver[i].pathN);
}
time = sdkGetTimerValue(&hTimer[0]);
printf("\nTotal time (ms.): %f\n", time);
printf("\tNote: This is elapsed time for all to compute.\n");
printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
sumDelta = 0;
sumRef = 0;
sumReserve = 0;
for (i = 0; i < OPT_N; i++)
{
BlackScholesCall(callValueBS[i], optionData[i]);
delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
ref = callValueBS[i];
sumDelta += delta;
sumRef += fabs(ref);
if (delta > 1e-6)
{
sumReserve += callValueGPU[i].Confidence / delta;
}
#ifdef PRINT_RESULTS
printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
}
sumReserve /= OPT_N;
}
#ifdef DO_CPU
printf("main(): running CPU MonteCarlo...\n");
TOptionValue callValueCPU;
sumDelta = 0;
sumRef = 0;
for (i = 0; i < OPT_N; i++)
{
MonteCarloCPU(
callValueCPU,
optionData[i],
NULL,
PATH_N
);
delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
ref = callValueCPU.Expected;
sumDelta += delta;
sumRef += fabs(ref);
printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected);
printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
}
printf("L1 norm: %E\n", sumDelta / sumRef);
#endif
printf("Shutting down...\n");
for (int i=0; i<GPU_N; i++)
{
sdkStartTimer(&hTimer[i]);
checkCudaErrors(cudaSetDevice(i));
}
delete[] optionSolver;
delete[] callValueBS;
delete[] callValueGPU;
delete[] optionData;
delete[] threadID;
delete[] hTimer;
printf("Test Summary...\n");
printf("L1 norm : %E\n", sumDelta / sumRef);
printf("Average reserve: %f\n", sumReserve);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");
printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
}