
CSE6230 Exercise 06

Haoming Jiang


Performance Assessment Script
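
The main time-stepping loop is instrumented with PetscGetTime so that the wall-clock time of each block of 1000 steps is reported together with the average distance traveled by the particles: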

    PetscLogDouble v1,v2,elapsed_time;
    v1 = 0;
    for (i = 0; i < numSteps; i++) {
      if (!(i % 1000)) {
        ierr = PetscGetTime(&v2);CHKERRQ(ierr);
        elapsed_time = v2-v1;
        h_globalAvg = 0.;
        cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
        average_distance<<<grid, block>>>(x, xInit);
        cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
        cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
        ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
        ierr = PetscGetTime(&v1);CHKERRQ(ierr);
      }
      cerr = cudaMemset(forces, 0, h_dim * h_numParticles * sizeof(PetscReal)); CUDA_CHK(cerr);
      compute_forces<<<grid, block>>>(x, forces);
      sum_noise_and_forces<<<grid, block>>>(x, forces, randState);
      cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
    }
    if ((i % 1000) != 1) {
      ierr = PetscGetTime(&v2);CHKERRQ(ierr);
      elapsed_time = v2-v1;
      h_globalAvg = 0.;
      cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
      average_distance<<<grid, block>>>(x, xInit);
      cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
      cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
      ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
      ierr = PetscGetTime(&v1);CHKERRQ(ierr);
    }
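
The job script requests one K80 GPU on the GPU-shared queue, loads the course PETSc and CUDA modules, records the git state of the source tree, and profiles the run with nvprof:
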
    #SBATCH -J ex06-k80          # Job name
    #SBATCH -p GPU-shared        # Queue (RM, RM-shared, GPU, GPU-shared)
    #SBATCH -N 1                 # Number of nodes
    #SBATCH --gres=gpu:k80:1     # GPU type and amount
    #SBATCH -t 00:10:00          # Time limit hrs:min:sec
    #SBATCH -o ex06-k80-%j.out   # Standard output and error log
    module use /home/tisaac/opt/modulefiles
    module load petsc/cse6230-double
    module load cuda
    export PGI_ACC_TIME=1
    make ex06
    git rev-parse HEAD
    git diff-files
    nvprof ./ex06 -num_steps 1000
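
The script is submitted to the scheduler with sbatch (for example, sbatch ex06-k80.sbatch; the file name here is simply whatever the script above is saved as).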

Output of Performance Script

    ==31674== NVPROF is profiling process 31674, command: ./ex06 -num_steps 1000
    Testing 10000 particles in 3 dimensions with steric repulsion:
    particle mass: 1.
    particle radius: 1.
    k_repulsion: 100.
    box length: 68.
    time step: 0.0001
    number of steps: 1000.
    Average distance traveled at time 0.: 0.; Petsctimer: 9.53674e-07
    Average distance traveled at time 0.1: 0.775785; Petsctimer: 62.338
    ==31674== Profiling application: ./ex06 -num_steps 1000
    ==31674== Profiling result:
    Time(%) Time Calls Avg Min Max Name
    99.91% 61.8745s 1000 61.874ms 54.585ms 62.356ms compute_forces(double*, double*)
    0.08% 48.273ms 1000 48.272us 19.872us 74.815us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
    0.01% 7.6320ms 1 7.6320ms 7.6320ms 7.6320ms setup_kernel(curandStateXORWOW*, unsigned long)
    0.00% 1.1592ms 1000 1.1590us 1.0550us 11.616us [CUDA memset]
    0.00% 772.92us 2 386.46us 370.55us 402.36us average_distance(double*, double*)
    0.00% 46.592us 1 46.592us 46.592us 46.592us initialize_points(double*, curandStateXORWOW*)
    0.00% 14.080us 11 1.2800us 1.1840us 1.7280us [CUDA memcpy HtoD]
    0.00% 6.3030us 1 6.3030us 6.3030us 6.3030us [CUDA memcpy DtoD]
    0.00% 5.9520us 2 2.9760us 2.9760us 2.9760us [CUDA memcpy DtoH]
    ==31674== API calls:
    Time(%) Time Calls Avg Min Max Name
    98.09% 61.9910s 1002 61.867ms 361.17us 66.332ms cudaDeviceSynchronize
    1.35% 851.75ms 8 106.47ms 6.8670us 850.27ms cudaFree
    0.29% 183.48ms 1000 183.48us 10.400us 4.1659ms cudaMemset
    0.22% 139.96ms 2004 69.838us 5.3140us 18.368ms cudaLaunch
    0.02% 11.414ms 10 1.1414ms 5.8890us 7.6609ms cudaMemcpyToSymbol
    0.02% 10.214ms 5008 2.0390us 136ns 1.7300ms cudaSetupArgument
    0.01% 5.3405ms 2004 2.6640us 166ns 151.62us cudaConfigureCall
    0.00% 1.7006ms 178 9.5530us 130ns 351.41us cuDeviceGetAttribute
    0.00% 1.4692ms 7 209.88us 8.9870us 561.43us cudaMalloc
    0.00% 823.24us 1 823.24us 823.24us 823.24us cudaGetDeviceProperties
    0.00% 601.29us 2 300.65us 214.85us 386.44us cuDeviceTotalMem
    0.00% 152.27us 2 76.137us 4.9450us 147.33us cudaThreadSynchronize
    0.00% 145.37us 2 72.686us 71.796us 73.577us cuDeviceGetName
    0.00% 49.641us 2 24.820us 23.784us 25.857us cudaMemcpyFromSymbol
    0.00% 47.484us 2 23.742us 21.850us 25.634us cudaMemcpy
    0.00% 12.186us 16 761ns 456ns 3.5290us cudaEventCreateWithFlags
    0.00% 10.923us 16 682ns 418ns 1.9100us cudaEventDestroy
    0.00% 7.9800us 1 7.9800us 7.9800us 7.9800us cudaSetDeviceFlags
    0.00% 6.0330us 11 548ns 282ns 2.2010us cudaDeviceGetAttribute
    0.00% 3.2600us 4 815ns 250ns 2.1460us cuDeviceGetCount
    0.00% 2.9700us 1 2.9700us 2.9700us 2.9700us cudaGetDevice
    0.00% 1.3480us 4 337ns 217ns 618ns cuDeviceGet
    0.00% 537ns 1 537ns 537ns 537ns cuInit
    0.00% 488ns 1 488ns 488ns 488ns cuDriverGetVersion
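
The profile shows that compute_forces accounts for essentially all of the GPU time (99.91%, about 62 ms per call), so it is the kernel worth optimizing.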

Planned changes

We can use shared memory to hold the coordinates of the particle a thread is updating, so that it does not need to re-read them from global memory on every iteration of the inner loop over the other particles. This also partly addresses the non-coalesced global memory access pattern; see the listing of compute_forces below.

Listing of compute_forces

    /* Compute the repulsive forces between particles: one thread updates each particle */
    __global__ void compute_forces(PetscReal *x, PetscReal *forces)
    {
      int lid = threadIdx.x;
      int id = threadIdx.x + blockIdx.x * blockDim.x;
      int gridSize = blockDim.x * gridDim.x;
      int i, j, k;
      __shared__ double *localx;

      /* thread 0 allocates one per-block scratch buffer; the pointer is shared across the block */
      if (!lid) {
        localx = (double *) malloc(blockDim.x * dim * sizeof(double));
      }
      __syncthreads();
      for (i = id; i < numParticles; i += gridSize) {
        /* cache this particle's coordinates so the inner loop does not re-read them from x */
        for (k = 0; k < dim; k++) {
          localx[k*blockDim.x+lid] = x[dim * i + k];
        }
        for (j = 0; j < numParticles; j++) {
          double dist2 = 0.;

          if (i == j) continue;
          for (k = 0; k < dim; k++) {
            double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k], L);

            dist2 += disp * disp;
          }
          if (dist2 < 4. * a * a) {
            /* particles overlap: apply steric repulsion along the displacement */
            double dist = sqrt(dist2);
            double f = krepul * (2. - dist);

            for (k = 0; k < dim; k++) {
              double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k], L);

              forces[dim * i + k] += f * disp / dist;
            }
          }
        }
      }
      __syncthreads(); /* all threads must be done with localx before thread 0 frees it */
      if (!lid) {
        free(localx);
      }
    }
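
Note that the listing above obtains the buffer with a device-side malloc, so the cached coordinates actually live in the device heap and only the pointer is shared across the block. A variant that keeps the cache in a statically sized __shared__ array is sketched below. This is a sketch rather than the code that was profiled: it assumes a fixed launch block size of 256 threads and at most 3 dimensions, and it relies on the same device constants (dim, numParticles, a, krepul, L) as the listing above.

    #define TILE_BLOCK_SIZE 256  /* assumed launch block size; must match the kernel launch */

    __global__ void compute_forces_shared(PetscReal *x, PetscReal *forces)
    {
      /* on-chip shared memory tile: up to 3 coordinates per thread in the block */
      __shared__ double localx[3 * TILE_BLOCK_SIZE];
      int lid = threadIdx.x;
      int id = threadIdx.x + blockIdx.x * blockDim.x;
      int gridSize = blockDim.x * gridDim.x;
      int i, j, k;

      for (i = id; i < numParticles; i += gridSize) {
        /* cache this particle's coordinates once per outer iteration */
        for (k = 0; k < dim; k++) {
          localx[k * blockDim.x + lid] = x[dim * i + k];
        }
        for (j = 0; j < numParticles; j++) {
          double dist2 = 0.;

          if (i == j) continue;
          for (k = 0; k < dim; k++) {
            double disp = remainder(localx[k * blockDim.x + lid] - x[dim * j + k], L);

            dist2 += disp * disp;
          }
          if (dist2 < 4. * a * a) {
            double dist = sqrt(dist2);
            double f = krepul * (2. - dist);

            for (k = 0; k < dim; k++) {
              double disp = remainder(localx[k * blockDim.x + lid] - x[dim * j + k], L);

              forces[dim * i + k] += f * disp / dist;
            }
          }
        }
      }
    }

Because each thread only touches its own slots in localx, this version needs no __syncthreads and no device-side malloc/free; whether it is faster than the heap buffer would have to be measured the same way as above.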

Output of Performance Script (After Changes)

This gives a significant improvement: the total compute_forces time drops from about 62 s to about 27 s (on average 61.9 ms down to 27.5 ms per call).

    ==10405== NVPROF is profiling process 10405, command: ./ex06 -num_steps 1000
    Testing 10000 particles in 3 dimensions with steric repulsion:
    particle mass: 1.
    particle radius: 1.
    k_repulsion: 100.
    box length: 68.
    time step: 0.0001
    number of steps: 1000.
    Average distance traveled at time 0.: 0.; Petsctimer: 9.53674e-07
    Average distance traveled at time 0.1: 0.775785; Petsctimer: 27.553
    ==10405== Profiling application: ./ex06 -num_steps 1000
    ==10405== Profiling result:
    Time(%) Time Calls Avg Min Max Name
    99.86% 27.4698s 1000 27.470ms 27.355ms 27.578ms compute_forces(double*, double*)
    0.12% 33.281ms 1000 33.280us 22.144us 47.135us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
    0.01% 3.9751ms 1 3.9751ms 3.9751ms 3.9751ms setup_kernel(curandStateXORWOW*, unsigned long)
    0.00% 1.1581ms 1000 1.1580us 1.1200us 1.6320us [CUDA memset]
    0.00% 424.95us 2 212.48us 209.28us 215.68us average_distance(double*, double*)
    0.00% 29.696us 1 29.696us 29.696us 29.696us initialize_points(double*, curandStateXORWOW*)
    0.00% 14.656us 11 1.3320us 1.2160us 1.7600us [CUDA memcpy HtoD]
    0.00% 6.0800us 2 3.0400us 3.0080us 3.0720us [CUDA memcpy DtoH]
    0.00% 3.6800us 1 3.6800us 3.6800us 3.6800us [CUDA memcpy DtoD]
    ==10405== API calls:
    Time(%) Time Calls Avg Min Max Name
    97.37% 27.5102s 1002 27.455ms 212.97us 27.721ms cudaDeviceSynchronize
    2.36% 667.97ms 8 83.496ms 20.190us 666.12ms cudaFree
    0.14% 40.756ms 2004 20.337us 5.2480us 17.157ms cudaLaunch
    0.07% 18.939ms 1000 18.939us 9.0580us 236.52us cudaMemset
    0.03% 7.9732ms 10 797.32us 6.2620us 3.9868ms cudaMemcpyToSymbol
    0.01% 2.3126ms 178 12.992us 127ns 654.55us cuDeviceGetAttribute
    0.00% 1.2667ms 7 180.95us 8.3140us 334.40us cudaMalloc
    0.00% 1.2197ms 5008 243ns 135ns 10.188us cudaSetupArgument
    0.00% 943.42us 1 943.42us 943.42us 943.42us cudaGetDeviceProperties
    0.00% 830.14us 2004 414ns 166ns 10.570us cudaConfigureCall
    0.00% 478.86us 2 239.43us 220.98us 257.88us cuDeviceTotalMem
    0.00% 183.66us 2 91.828us 90.457us 93.199us cuDeviceGetName
    0.00% 73.310us 2 36.655us 25.746us 47.564us cudaMemcpyFromSymbol
    0.00% 47.385us 2 23.692us 21.845us 25.540us cudaMemcpy
    0.00% 23.497us 16 1.4680us 1.0820us 3.4840us cudaEventDestroy
    0.00% 22.304us 2 11.152us 10.900us 11.404us cudaThreadSynchronize
    0.00% 12.378us 16 773ns 458ns 3.5040us cudaEventCreateWithFlags
    0.00% 7.7670us 1 7.7670us 7.7670us 7.7670us cudaSetDeviceFlags
    0.00% 5.5410us 11 503ns 271ns 1.9720us cudaDeviceGetAttribute
    0.00% 3.3300us 4 832ns 213ns 2.3060us cuDeviceGetCount
    0.00% 2.7160us 1 2.7160us 2.7160us 2.7160us cudaGetDevice
    0.00% 1.4270us 4 356ns 275ns 590ns cuDeviceGet
    0.00% 591ns 1 591ns 591ns 591ns cuDriverGetVersion
    0.00% 572ns 1 572ns 572ns 572ns cuInit