
CSE6230 Exercise 06

Haoming Jiang


Performance Assessment Script
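
The main time-stepping loop is instrumented with PetscGetTime so that the wall-clock time of each block of 1000 steps is reported together with the average distance traveled by the particles: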

    PetscLogDouble v1,v2,elapsed_time;
    v1 = 0;
    for (i = 0; i < numSteps; i++) {
      if (!(i % 1000)) {
        ierr = PetscGetTime(&v2);CHKERRQ(ierr);
        elapsed_time = v2-v1;
        h_globalAvg = 0.;
        cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
        average_distance<<<grid, block>>>(x, xInit);
        cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
        cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
        ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
        ierr = PetscGetTime(&v1);CHKERRQ(ierr);
      }
      cerr = cudaMemset(forces, 0, h_dim * h_numParticles * sizeof(PetscReal)); CUDA_CHK(cerr);
      compute_forces<<<grid, block>>>(x, forces);
      sum_noise_and_forces<<<grid, block>>>(x, forces, randState);
      cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
    }
    if ((i % 1000) != 1) {
      ierr = PetscGetTime(&v2);CHKERRQ(ierr);
      elapsed_time = v2-v1;
      h_globalAvg = 0.;
      cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
      average_distance<<<grid, block>>>(x, xInit);
      cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
      cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
      ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
      ierr = PetscGetTime(&v1);CHKERRQ(ierr);
    }
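
The job script requests one K80 GPU on the GPU-shared queue, loads the course PETSc and CUDA modules, records the git state of the source tree, and profiles the run with nvprof:
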
    #SBATCH -J ex06-k80          # Job name
    #SBATCH -p GPU-shared        # Queue (RM, RM-shared, GPU, GPU-shared)
    #SBATCH -N 1                 # Number of nodes
    #SBATCH --gres=gpu:k80:1     # GPU type and amount
    #SBATCH -t 00:10:00          # Time limit hrs:min:sec
    #SBATCH -o ex06-k80-%j.out   # Standard output and error log
    module use /home/tisaac/opt/modulefiles
    module load petsc/cse6230-double
    module load cuda
    export PGI_ACC_TIME=1
    make ex06
    git rev-parse HEAD
    git diff-files
    nvprof ./ex06 -num_steps 1000
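
The script is submitted to the scheduler with sbatch (for example, sbatch ex06-k80.sbatch; the file name here is simply whatever the script above is saved as).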

Output of Performance Script

    ==31674== NVPROF is profiling process 31674, command: ./ex06 -num_steps 1000
    Testing 10000 particles in 3 dimensions with steric repulsion:
    particle mass: 1.
    particle radius: 1.
    k_repulsion: 100.
    box length: 68.
    time step: 0.0001
    number of steps: 1000.
    Average distance traveled at time 0.: 0.; Petsctimer: 9.53674e-07
    Average distance traveled at time 0.1: 0.775785; Petsctimer: 62.338
    ==31674== Profiling application: ./ex06 -num_steps 1000
    ==31674== Profiling result:
    Time(%) Time Calls Avg Min Max Name
    99.91% 61.8745s 1000 61.874ms 54.585ms 62.356ms compute_forces(double*, double*)
    0.08% 48.273ms 1000 48.272us 19.872us 74.815us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
    0.01% 7.6320ms 1 7.6320ms 7.6320ms 7.6320ms setup_kernel(curandStateXORWOW*, unsigned long)
    0.00% 1.1592ms 1000 1.1590us 1.0550us 11.616us [CUDA memset]
    0.00% 772.92us 2 386.46us 370.55us 402.36us average_distance(double*, double*)
    0.00% 46.592us 1 46.592us 46.592us 46.592us initialize_points(double*, curandStateXORWOW*)
    0.00% 14.080us 11 1.2800us 1.1840us 1.7280us [CUDA memcpy HtoD]
    0.00% 6.3030us 1 6.3030us 6.3030us 6.3030us [CUDA memcpy DtoD]
    0.00% 5.9520us 2 2.9760us 2.9760us 2.9760us [CUDA memcpy DtoH]
    ==31674== API calls:
    Time(%) Time Calls Avg Min Max Name
    98.09% 61.9910s 1002 61.867ms 361.17us 66.332ms cudaDeviceSynchronize
    1.35% 851.75ms 8 106.47ms 6.8670us 850.27ms cudaFree
    0.29% 183.48ms 1000 183.48us 10.400us 4.1659ms cudaMemset
    0.22% 139.96ms 2004 69.838us 5.3140us 18.368ms cudaLaunch
    0.02% 11.414ms 10 1.1414ms 5.8890us 7.6609ms cudaMemcpyToSymbol
    0.02% 10.214ms 5008 2.0390us 136ns 1.7300ms cudaSetupArgument
    0.01% 5.3405ms 2004 2.6640us 166ns 151.62us cudaConfigureCall
    0.00% 1.7006ms 178 9.5530us 130ns 351.41us cuDeviceGetAttribute
    0.00% 1.4692ms 7 209.88us 8.9870us 561.43us cudaMalloc
    0.00% 823.24us 1 823.24us 823.24us 823.24us cudaGetDeviceProperties
    0.00% 601.29us 2 300.65us 214.85us 386.44us cuDeviceTotalMem
    0.00% 152.27us 2 76.137us 4.9450us 147.33us cudaThreadSynchronize
    0.00% 145.37us 2 72.686us 71.796us 73.577us cuDeviceGetName
    0.00% 49.641us 2 24.820us 23.784us 25.857us cudaMemcpyFromSymbol
    0.00% 47.484us 2 23.742us 21.850us 25.634us cudaMemcpy
    0.00% 12.186us 16 761ns 456ns 3.5290us cudaEventCreateWithFlags
    0.00% 10.923us 16 682ns 418ns 1.9100us cudaEventDestroy
    0.00% 7.9800us 1 7.9800us 7.9800us 7.9800us cudaSetDeviceFlags
    0.00% 6.0330us 11 548ns 282ns 2.2010us cudaDeviceGetAttribute
    0.00% 3.2600us 4 815ns 250ns 2.1460us cuDeviceGetCount
    0.00% 2.9700us 1 2.9700us 2.9700us 2.9700us cudaGetDevice
    0.00% 1.3480us 4 337ns 217ns 618ns cuDeviceGet
    0.00% 537ns 1 537ns 537ns 537ns cuInit
    0.00% 488ns 1 488ns 488ns 488ns cuDriverGetVersion
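
The profile shows that compute_forces accounts for essentially all of the GPU time (99.91%, about 62 ms per call), so it is the kernel worth optimizing.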

Planned changes

We can use shared memory to hold the coordinates of the particle a thread is updating, so that it does not need to re-read them from global memory on every iteration of the inner loop over the other particles. This also partly addresses the non-coalesced global memory access pattern; see the listing of compute_forces below.

Listing of compute_forces

    /* Compute the repulsive forces between particles: one thread updates each particle */
    __global__ void compute_forces(PetscReal *x, PetscReal *forces)
    {
      int lid = threadIdx.x;
      int id = threadIdx.x + blockIdx.x * blockDim.x;
      int gridSize = blockDim.x * gridDim.x;
      int i, j, k;
      __shared__ double *localx;

      /* thread 0 allocates one per-block scratch buffer; the pointer is shared across the block */
      if (!lid) {
        localx = (double *) malloc(blockDim.x * dim * sizeof(double));
      }
      __syncthreads();
      for (i = id; i < numParticles; i += gridSize) {
        /* cache this particle's coordinates so the inner loop does not re-read them from x */
        for (k = 0; k < dim; k++) {
          localx[k*blockDim.x+lid] = x[dim * i + k];
        }
        for (j = 0; j < numParticles; j++) {
          double dist2 = 0.;

          if (i == j) continue;
          for (k = 0; k < dim; k++) {
            double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k], L);

            dist2 += disp * disp;
          }
          if (dist2 < 4. * a * a) {
            /* particles overlap: apply steric repulsion along the displacement */
            double dist = sqrt(dist2);
            double f = krepul * (2. - dist);

            for (k = 0; k < dim; k++) {
              double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k], L);

              forces[dim * i + k] += f * disp / dist;
            }
          }
        }
      }
      __syncthreads(); /* all threads must be done with localx before thread 0 frees it */
      if (!lid) {
        free(localx);
      }
    }
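
Note that the listing above obtains the buffer with a device-side malloc, so the cached coordinates actually live in the device heap and only the pointer is shared across the block. A variant that keeps the cache in a statically sized __shared__ array is sketched below. This is a sketch rather than the code that was profiled: it assumes a fixed launch block size of 256 threads and at most 3 dimensions, and it relies on the same device constants (dim, numParticles, a, krepul, L) as the listing above.

    #define TILE_BLOCK_SIZE 256  /* assumed launch block size; must match the kernel launch */

    __global__ void compute_forces_shared(PetscReal *x, PetscReal *forces)
    {
      /* on-chip shared memory tile: up to 3 coordinates per thread in the block */
      __shared__ double localx[3 * TILE_BLOCK_SIZE];
      int lid = threadIdx.x;
      int id = threadIdx.x + blockIdx.x * blockDim.x;
      int gridSize = blockDim.x * gridDim.x;
      int i, j, k;

      for (i = id; i < numParticles; i += gridSize) {
        /* cache this particle's coordinates once per outer iteration */
        for (k = 0; k < dim; k++) {
          localx[k * blockDim.x + lid] = x[dim * i + k];
        }
        for (j = 0; j < numParticles; j++) {
          double dist2 = 0.;

          if (i == j) continue;
          for (k = 0; k < dim; k++) {
            double disp = remainder(localx[k * blockDim.x + lid] - x[dim * j + k], L);

            dist2 += disp * disp;
          }
          if (dist2 < 4. * a * a) {
            double dist = sqrt(dist2);
            double f = krepul * (2. - dist);

            for (k = 0; k < dim; k++) {
              double disp = remainder(localx[k * blockDim.x + lid] - x[dim * j + k], L);

              forces[dim * i + k] += f * disp / dist;
            }
          }
        }
      }
    }

Because each thread only touches its own slots in localx, this version needs no __syncthreads and no device-side malloc/free; whether it is faster than the heap buffer would have to be measured the same way as above.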

Output of Performance Script (After Changes)

This gives a significant improvement: the total compute_forces time drops from about 62 s to about 27 s (on average 61.9 ms down to 27.5 ms per call).

    ==10405== NVPROF is profiling process 10405, command: ./ex06 -num_steps 1000
    Testing 10000 particles in 3 dimensions with steric repulsion:
    particle mass: 1.
    particle radius: 1.
    k_repulsion: 100.
    box length: 68.
    time step: 0.0001
    number of steps: 1000.
    Average distance traveled at time 0.: 0.; Petsctimer: 9.53674e-07
    Average distance traveled at time 0.1: 0.775785; Petsctimer: 27.553
    ==10405== Profiling application: ./ex06 -num_steps 1000
    ==10405== Profiling result:
    Time(%) Time Calls Avg Min Max Name
    99.86% 27.4698s 1000 27.470ms 27.355ms 27.578ms compute_forces(double*, double*)
    0.12% 33.281ms 1000 33.280us 22.144us 47.135us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
    0.01% 3.9751ms 1 3.9751ms 3.9751ms 3.9751ms setup_kernel(curandStateXORWOW*, unsigned long)
    0.00% 1.1581ms 1000 1.1580us 1.1200us 1.6320us [CUDA memset]
    0.00% 424.95us 2 212.48us 209.28us 215.68us average_distance(double*, double*)
    0.00% 29.696us 1 29.696us 29.696us 29.696us initialize_points(double*, curandStateXORWOW*)
    0.00% 14.656us 11 1.3320us 1.2160us 1.7600us [CUDA memcpy HtoD]
    0.00% 6.0800us 2 3.0400us 3.0080us 3.0720us [CUDA memcpy DtoH]
    0.00% 3.6800us 1 3.6800us 3.6800us 3.6800us [CUDA memcpy DtoD]
    ==10405== API calls:
    Time(%) Time Calls Avg Min Max Name
    97.37% 27.5102s 1002 27.455ms 212.97us 27.721ms cudaDeviceSynchronize
    2.36% 667.97ms 8 83.496ms 20.190us 666.12ms cudaFree
    0.14% 40.756ms 2004 20.337us 5.2480us 17.157ms cudaLaunch
    0.07% 18.939ms 1000 18.939us 9.0580us 236.52us cudaMemset
    0.03% 7.9732ms 10 797.32us 6.2620us 3.9868ms cudaMemcpyToSymbol
    0.01% 2.3126ms 178 12.992us 127ns 654.55us cuDeviceGetAttribute
    0.00% 1.2667ms 7 180.95us 8.3140us 334.40us cudaMalloc
    0.00% 1.2197ms 5008 243ns 135ns 10.188us cudaSetupArgument
    0.00% 943.42us 1 943.42us 943.42us 943.42us cudaGetDeviceProperties
    0.00% 830.14us 2004 414ns 166ns 10.570us cudaConfigureCall
    0.00% 478.86us 2 239.43us 220.98us 257.88us cuDeviceTotalMem
    0.00% 183.66us 2 91.828us 90.457us 93.199us cuDeviceGetName
    0.00% 73.310us 2 36.655us 25.746us 47.564us cudaMemcpyFromSymbol
    0.00% 47.385us 2 23.692us 21.845us 25.540us cudaMemcpy
    0.00% 23.497us 16 1.4680us 1.0820us 3.4840us cudaEventDestroy
    0.00% 22.304us 2 11.152us 10.900us 11.404us cudaThreadSynchronize
    0.00% 12.378us 16 773ns 458ns 3.5040us cudaEventCreateWithFlags
    0.00% 7.7670us 1 7.7670us 7.7670us 7.7670us cudaSetDeviceFlags
    0.00% 5.5410us 11 503ns 271ns 1.9720us cudaDeviceGetAttribute
    0.00% 3.3300us 4 832ns 213ns 2.3060us cuDeviceGetCount
    0.00% 2.7160us 1 2.7160us 2.7160us 2.7160us cudaGetDevice
    0.00% 1.4270us 4 356ns 275ns 590ns cuDeviceGet
    0.00% 591ns 1 591ns 591ns 591ns cuDriverGetVersion
    0.00% 572ns 1 572ns 572ns 572ns cuInit