@HaomingJiang
2017-11-21T22:40:56.000000Z
字数 7523
阅读 1626
Haoming Jiang
PetscLogDouble v1,v2,elapsed_time;v1 = 0;for (i = 0; i < numSteps; i++) {if (!(i % 1000)) {ierr = PetscGetTime(&v2);CHKERRQ(ierr);elapsed_time = v2-v1;h_globalAvg = 0.;cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);average_distance<<<grid, block>>>(x, xInit);cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);ierr = PetscGetTime(&v1);CHKERRQ(ierr);}cerr = cudaMemset(forces, 0, h_dim * h_numParticles * sizeof(PetscReal)); CUDA_CHK(cerr);compute_forces<<<grid, block>>>(x, forces);sum_noise_and_forces<<<grid, block>>>(x, forces, randState);cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);}if ((i % 1000) != 1) {ierr = PetscGetTime(&v2);CHKERRQ(ierr);elapsed_time = v2-v1;h_globalAvg = 0.;cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);average_distance<<<grid, block>>>(x, xInit);cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);ierr = PetscGetTime(&v1);CHKERRQ(ierr);}
#SBATCH -J ex06-k80 # Job name#SBATCH -p GPU-shared # Queue (RM, RM-shared, GPU, GPU-shared)#SBATCH -N 1 # Number of nodes#SBATCH --gres=gpu:k80:1 # GPU type and amount#SBATCH -t 00:10:00 # Time limit hrs:min:sec#SBATCH -o ex06-k80-%j.out # Standard output and error logmodule use /home/tisaac/opt/modulefilesmodule load petsc/cse6230-doublemodule load cudaexport PGI_ACC_TIME=1make ex06git rev-parse HEADgit diff-filesnvprof ./ex06 -num_steps 1000
==31674== NVPROF is profiling process 31674, command: ./ex06 -num_steps 1000Testing 10000 particles in 3 dimensions with steric repulsion:particle mass: 1.particle radius: 1.k_repulsion: 100.box length: 68.time step: 0.0001number of steps: 1000.Average distance traveled at time 0.: 0.;Petsctimer: 9.53674e-07Average distance traveled at time 0.1: 0.775785;Petsctimer: 62.338==31674== Profiling application: ./ex06 -num_steps 1000==31674== Profiling result:Time(%) Time Calls Avg Min Max Name99.91% 61.8745s 1000 61.874ms 54.585ms 62.356ms compute_forces(double*, double*)0.08% 48.273ms 1000 48.272us 19.872us 74.815us sum_noise_and_forces(double*, double*, curandStateXORWOW*)0.01% 7.6320ms 1 7.6320ms 7.6320ms 7.6320ms setup_kernel(curandStateXORWOW*, unsigned long)0.00% 1.1592ms 1000 1.1590us 1.0550us 11.616us [CUDA memset]0.00% 772.92us 2 386.46us 370.55us 402.36us average_distance(double*, double*)0.00% 46.592us 1 46.592us 46.592us 46.592us initialize_points(double*, curandStateXORWOW*)0.00% 14.080us 11 1.2800us 1.1840us 1.7280us [CUDA memcpy HtoD]0.00% 6.3030us 1 6.3030us 6.3030us 6.3030us [CUDA memcpy DtoD]0.00% 5.9520us 2 2.9760us 2.9760us 2.9760us [CUDA memcpy DtoH]==31674== API calls:Time(%) Time Calls Avg Min Max Name98.09% 61.9910s 1002 61.867ms 361.17us 66.332ms cudaDeviceSynchronize1.35% 851.75ms 8 106.47ms 6.8670us 850.27ms cudaFree0.29% 183.48ms 1000 183.48us 10.400us 4.1659ms cudaMemset0.22% 139.96ms 2004 69.838us 5.3140us 18.368ms cudaLaunch0.02% 11.414ms 10 1.1414ms 5.8890us 7.6609ms cudaMemcpyToSymbol0.02% 10.214ms 5008 2.0390us 136ns 1.7300ms cudaSetupArgument0.01% 5.3405ms 2004 2.6640us 166ns 151.62us cudaConfigureCall0.00% 1.7006ms 178 9.5530us 130ns 351.41us cuDeviceGetAttribute0.00% 1.4692ms 7 209.88us 8.9870us 561.43us cudaMalloc0.00% 823.24us 1 823.24us 823.24us 823.24us cudaGetDeviceProperties0.00% 601.29us 2 300.65us 214.85us 386.44us cuDeviceTotalMem0.00% 152.27us 2 76.137us 4.9450us 147.33us cudaThreadSynchronize0.00% 145.37us 2 72.686us 71.796us 73.577us cuDeviceGetName0.00% 49.641us 2 24.820us 23.784us 25.857us cudaMemcpyFromSymbol0.00% 47.484us 2 23.742us 21.850us 25.634us cudaMemcpy0.00% 12.186us 16 761ns 456ns 3.5290us cudaEventCreateWithFlags0.00% 10.923us 16 682ns 418ns 1.9100us cudaEventDestroy0.00% 7.9800us 1 7.9800us 7.9800us 7.9800us cudaSetDeviceFlags0.00% 6.0330us 11 548ns 282ns 2.2010us cudaDeviceGetAttribute0.00% 3.2600us 4 815ns 250ns 2.1460us cuDeviceGetCount0.00% 2.9700us 1 2.9700us 2.9700us 2.9700us cudaGetDevice0.00% 1.3480us 4 337ns 217ns 618ns cuDeviceGet0.00% 537ns 1 537ns 537ns 537ns cuInit0.00% 488ns 1 488ns 488ns 488ns cuDriverGetVersion
We can use share memory to store the location curent praticle. So that it does not need to read from global memory evry time. It somehow dealwith the coalesced memory access problem.
/* Compute the forces between two particles:- one thread to update each particle */__global__ void compute_forces(PetscReal *x, PetscReal *forces){int lid = threadIdx.x;int id = threadIdx.x + blockIdx.x * blockDim.x;int gridSize = blockDim.x * gridDim.x;int i, j, k;__shared__ double *localx;if (!lid) {localx = (double *) malloc(blockDim.x * dim * sizeof(double));}__syncthreads();for (i = id; i < numParticles; i += gridSize) {for (k = 0; k < dim; k++){localx[k*blockDim.x+lid] = x[dim * i + k];}for (j = 0; j < numParticles; j++) {double dist2 = 0.;if (i == j) continue;for (k = 0; k < dim; k++) {double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k],L);dist2 += disp * disp;}if (dist2 < 4. * a * a) {double dist = sqrt(dist2);double f = krepul * (2. - dist);for (k = 0; k < dim; k++) {double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k],L);forces[dim * i + k] += f * disp / dist;}}}}if (!lid) {free(localx);}}
We can see the significant improvement, i.e. reduce the time from 62s to 27s.
==10405== NVPROF is profiling process 10405, command: ./ex06 -num_steps 1000Testing 10000 particles in 3 dimensions with steric repulsion:particle mass: 1.particle radius: 1.k_repulsion: 100.box length: 68.time step: 0.0001number of steps: 1000.Average distance traveled at time 0.: 0.;Petsctimer: 9.53674e-07Average distance traveled at time 0.1: 0.775785;Petsctimer: 27.553==10405== Profiling application: ./ex06 -num_steps 1000==10405== Profiling result:Time(%) Time Calls Avg Min Max Name99.86% 27.4698s 1000 27.470ms 27.355ms 27.578ms compute_forces(double*, double*)0.12% 33.281ms 1000 33.280us 22.144us 47.135us sum_noise_and_forces(double*, double*, curandStateXORWOW*)0.01% 3.9751ms 1 3.9751ms 3.9751ms 3.9751ms setup_kernel(curandStateXORWOW*, unsigned long)0.00% 1.1581ms 1000 1.1580us 1.1200us 1.6320us [CUDA memset]0.00% 424.95us 2 212.48us 209.28us 215.68us average_distance(double*, double*)0.00% 29.696us 1 29.696us 29.696us 29.696us initialize_points(double*, curandStateXORWOW*)0.00% 14.656us 11 1.3320us 1.2160us 1.7600us [CUDA memcpy HtoD]0.00% 6.0800us 2 3.0400us 3.0080us 3.0720us [CUDA memcpy DtoH]0.00% 3.6800us 1 3.6800us 3.6800us 3.6800us [CUDA memcpy DtoD]==10405== API calls:Time(%) Time Calls Avg Min Max Name97.37% 27.5102s 1002 27.455ms 212.97us 27.721ms cudaDeviceSynchronize2.36% 667.97ms 8 83.496ms 20.190us 666.12ms cudaFree0.14% 40.756ms 2004 20.337us 5.2480us 17.157ms cudaLaunch0.07% 18.939ms 1000 18.939us 9.0580us 236.52us cudaMemset0.03% 7.9732ms 10 797.32us 6.2620us 3.9868ms cudaMemcpyToSymbol0.01% 2.3126ms 178 12.992us 127ns 654.55us cuDeviceGetAttribute0.00% 1.2667ms 7 180.95us 8.3140us 334.40us cudaMalloc0.00% 1.2197ms 5008 243ns 135ns 10.188us cudaSetupArgument0.00% 943.42us 1 943.42us 943.42us 943.42us cudaGetDeviceProperties0.00% 830.14us 2004 414ns 166ns 10.570us cudaConfigureCall0.00% 478.86us 2 239.43us 220.98us 257.88us cuDeviceTotalMem0.00% 183.66us 2 91.828us 90.457us 93.199us cuDeviceGetName0.00% 73.310us 2 36.655us 25.746us 47.564us cudaMemcpyFromSymbol0.00% 47.385us 2 23.692us 21.845us 25.540us cudaMemcpy0.00% 23.497us 16 1.4680us 1.0820us 3.4840us cudaEventDestroy0.00% 22.304us 2 11.152us 10.900us 11.404us cudaThreadSynchronize0.00% 12.378us 16 773ns 458ns 3.5040us cudaEventCreateWithFlags0.00% 7.7670us 1 7.7670us 7.7670us 7.7670us cudaSetDeviceFlags0.00% 5.5410us 11 503ns 271ns 1.9720us cudaDeviceGetAttribute0.00% 3.3300us 4 832ns 213ns 2.3060us cuDeviceGetCount0.00% 2.7160us 1 2.7160us 2.7160us 2.7160us cudaGetDevice0.00% 1.4270us 4 356ns 275ns 590ns cuDeviceGet0.00% 591ns 1 591ns 591ns 591ns cuDriverGetVersion0.00% 572ns 1 572ns 572ns 572ns cuInit