@HaomingJiang
2017-11-22T06:40:56.000000Z
字数 7523
阅读 1298
Haoming Jiang
PetscLogDouble v1,v2,elapsed_time;
v1 = 0;
for (i = 0; i < numSteps; i++) {
if (!(i % 1000)) {
ierr = PetscGetTime(&v2);CHKERRQ(ierr);
elapsed_time = v2-v1;
h_globalAvg = 0.;
cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
average_distance<<<grid, block>>>(x, xInit);
cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
ierr = PetscGetTime(&v1);CHKERRQ(ierr);
}
cerr = cudaMemset(forces, 0, h_dim * h_numParticles * sizeof(PetscReal)); CUDA_CHK(cerr);
compute_forces<<<grid, block>>>(x, forces);
sum_noise_and_forces<<<grid, block>>>(x, forces, randState);
cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
}
if ((i % 1000) != 1) {
ierr = PetscGetTime(&v2);CHKERRQ(ierr);
elapsed_time = v2-v1;
h_globalAvg = 0.;
cerr = cudaMemcpyToSymbol(globalAvg, &h_globalAvg, sizeof(float)); CUDA_CHK(cerr);
average_distance<<<grid, block>>>(x, xInit);
cerr = cudaDeviceSynchronize(); CUDA_CHK(cerr);
cerr = cudaMemcpyFromSymbol(&h_globalAvg, globalAvg, sizeof(float)); CUDA_CHK(cerr);
ierr = PetscPrintf(comm, "Average distance traveled at time %g: %g; Petsctimer: %g\n", i * h_dt, (double) h_globalAvg, elapsed_time); CHKERRQ(ierr);
ierr = PetscGetTime(&v1);CHKERRQ(ierr);
}
#SBATCH -J ex06-k80 # Job name
#SBATCH -p GPU-shared # Queue (RM, RM-shared, GPU, GPU-shared)
#SBATCH -N 1 # Number of nodes
#SBATCH --gres=gpu:k80:1 # GPU type and amount
#SBATCH -t 00:10:00 # Time limit hrs:min:sec
#SBATCH -o ex06-k80-%j.out # Standard output and error log
module use /home/tisaac/opt/modulefiles
module load petsc/cse6230-double
module load cuda
export PGI_ACC_TIME=1
make ex06
git rev-parse HEAD
git diff-files
nvprof ./ex06 -num_steps 1000
==31674== NVPROF is profiling process 31674, command: ./ex06 -num_steps 1000
Testing 10000 particles in 3 dimensions with steric repulsion:
particle mass: 1.
particle radius: 1.
k_repulsion: 100.
box length: 68.
time step: 0.0001
number of steps: 1000.
Average distance traveled at time 0.: 0.;
Petsctimer: 9.53674e-07
Average distance traveled at time 0.1: 0.775785;
Petsctimer: 62.338
==31674== Profiling application: ./ex06 -num_steps 1000
==31674== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.91% 61.8745s 1000 61.874ms 54.585ms 62.356ms compute_forces(double*, double*)
0.08% 48.273ms 1000 48.272us 19.872us 74.815us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
0.01% 7.6320ms 1 7.6320ms 7.6320ms 7.6320ms setup_kernel(curandStateXORWOW*, unsigned long)
0.00% 1.1592ms 1000 1.1590us 1.0550us 11.616us [CUDA memset]
0.00% 772.92us 2 386.46us 370.55us 402.36us average_distance(double*, double*)
0.00% 46.592us 1 46.592us 46.592us 46.592us initialize_points(double*, curandStateXORWOW*)
0.00% 14.080us 11 1.2800us 1.1840us 1.7280us [CUDA memcpy HtoD]
0.00% 6.3030us 1 6.3030us 6.3030us 6.3030us [CUDA memcpy DtoD]
0.00% 5.9520us 2 2.9760us 2.9760us 2.9760us [CUDA memcpy DtoH]
==31674== API calls:
Time(%) Time Calls Avg Min Max Name
98.09% 61.9910s 1002 61.867ms 361.17us 66.332ms cudaDeviceSynchronize
1.35% 851.75ms 8 106.47ms 6.8670us 850.27ms cudaFree
0.29% 183.48ms 1000 183.48us 10.400us 4.1659ms cudaMemset
0.22% 139.96ms 2004 69.838us 5.3140us 18.368ms cudaLaunch
0.02% 11.414ms 10 1.1414ms 5.8890us 7.6609ms cudaMemcpyToSymbol
0.02% 10.214ms 5008 2.0390us 136ns 1.7300ms cudaSetupArgument
0.01% 5.3405ms 2004 2.6640us 166ns 151.62us cudaConfigureCall
0.00% 1.7006ms 178 9.5530us 130ns 351.41us cuDeviceGetAttribute
0.00% 1.4692ms 7 209.88us 8.9870us 561.43us cudaMalloc
0.00% 823.24us 1 823.24us 823.24us 823.24us cudaGetDeviceProperties
0.00% 601.29us 2 300.65us 214.85us 386.44us cuDeviceTotalMem
0.00% 152.27us 2 76.137us 4.9450us 147.33us cudaThreadSynchronize
0.00% 145.37us 2 72.686us 71.796us 73.577us cuDeviceGetName
0.00% 49.641us 2 24.820us 23.784us 25.857us cudaMemcpyFromSymbol
0.00% 47.484us 2 23.742us 21.850us 25.634us cudaMemcpy
0.00% 12.186us 16 761ns 456ns 3.5290us cudaEventCreateWithFlags
0.00% 10.923us 16 682ns 418ns 1.9100us cudaEventDestroy
0.00% 7.9800us 1 7.9800us 7.9800us 7.9800us cudaSetDeviceFlags
0.00% 6.0330us 11 548ns 282ns 2.2010us cudaDeviceGetAttribute
0.00% 3.2600us 4 815ns 250ns 2.1460us cuDeviceGetCount
0.00% 2.9700us 1 2.9700us 2.9700us 2.9700us cudaGetDevice
0.00% 1.3480us 4 337ns 217ns 618ns cuDeviceGet
0.00% 537ns 1 537ns 537ns 537ns cuInit
0.00% 488ns 1 488ns 488ns 488ns cuDriverGetVersion
We can use share memory to store the location curent praticle. So that it does not need to read from global memory evry time. It somehow dealwith the coalesced memory access problem.
/* Compute the forces between two particles:
- one thread to update each particle */
__global__ void compute_forces(PetscReal *x, PetscReal *forces)
{
int lid = threadIdx.x;
int id = threadIdx.x + blockIdx.x * blockDim.x;
int gridSize = blockDim.x * gridDim.x;
int i, j, k;
__shared__ double *localx;
if (!lid) {
localx = (double *) malloc(blockDim.x * dim * sizeof(double));
}
__syncthreads();
for (i = id; i < numParticles; i += gridSize) {
for (k = 0; k < dim; k++)
{
localx[k*blockDim.x+lid] = x[dim * i + k];
}
for (j = 0; j < numParticles; j++) {
double dist2 = 0.;
if (i == j) continue;
for (k = 0; k < dim; k++) {
double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k],L);
dist2 += disp * disp;
}
if (dist2 < 4. * a * a) {
double dist = sqrt(dist2);
double f = krepul * (2. - dist);
for (k = 0; k < dim; k++) {
double disp = remainder(localx[k*blockDim.x+lid] - x[dim * j + k],L);
forces[dim * i + k] += f * disp / dist;
}
}
}
}
if (!lid) {
free(localx);
}
}
We can see the significant improvement, i.e. reduce the time from 62s to 27s.
==10405== NVPROF is profiling process 10405, command: ./ex06 -num_steps 1000
Testing 10000 particles in 3 dimensions with steric repulsion:
particle mass: 1.
particle radius: 1.
k_repulsion: 100.
box length: 68.
time step: 0.0001
number of steps: 1000.
Average distance traveled at time 0.: 0.;
Petsctimer: 9.53674e-07
Average distance traveled at time 0.1: 0.775785;
Petsctimer: 27.553
==10405== Profiling application: ./ex06 -num_steps 1000
==10405== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.86% 27.4698s 1000 27.470ms 27.355ms 27.578ms compute_forces(double*, double*)
0.12% 33.281ms 1000 33.280us 22.144us 47.135us sum_noise_and_forces(double*, double*, curandStateXORWOW*)
0.01% 3.9751ms 1 3.9751ms 3.9751ms 3.9751ms setup_kernel(curandStateXORWOW*, unsigned long)
0.00% 1.1581ms 1000 1.1580us 1.1200us 1.6320us [CUDA memset]
0.00% 424.95us 2 212.48us 209.28us 215.68us average_distance(double*, double*)
0.00% 29.696us 1 29.696us 29.696us 29.696us initialize_points(double*, curandStateXORWOW*)
0.00% 14.656us 11 1.3320us 1.2160us 1.7600us [CUDA memcpy HtoD]
0.00% 6.0800us 2 3.0400us 3.0080us 3.0720us [CUDA memcpy DtoH]
0.00% 3.6800us 1 3.6800us 3.6800us 3.6800us [CUDA memcpy DtoD]
==10405== API calls:
Time(%) Time Calls Avg Min Max Name
97.37% 27.5102s 1002 27.455ms 212.97us 27.721ms cudaDeviceSynchronize
2.36% 667.97ms 8 83.496ms 20.190us 666.12ms cudaFree
0.14% 40.756ms 2004 20.337us 5.2480us 17.157ms cudaLaunch
0.07% 18.939ms 1000 18.939us 9.0580us 236.52us cudaMemset
0.03% 7.9732ms 10 797.32us 6.2620us 3.9868ms cudaMemcpyToSymbol
0.01% 2.3126ms 178 12.992us 127ns 654.55us cuDeviceGetAttribute
0.00% 1.2667ms 7 180.95us 8.3140us 334.40us cudaMalloc
0.00% 1.2197ms 5008 243ns 135ns 10.188us cudaSetupArgument
0.00% 943.42us 1 943.42us 943.42us 943.42us cudaGetDeviceProperties
0.00% 830.14us 2004 414ns 166ns 10.570us cudaConfigureCall
0.00% 478.86us 2 239.43us 220.98us 257.88us cuDeviceTotalMem
0.00% 183.66us 2 91.828us 90.457us 93.199us cuDeviceGetName
0.00% 73.310us 2 36.655us 25.746us 47.564us cudaMemcpyFromSymbol
0.00% 47.385us 2 23.692us 21.845us 25.540us cudaMemcpy
0.00% 23.497us 16 1.4680us 1.0820us 3.4840us cudaEventDestroy
0.00% 22.304us 2 11.152us 10.900us 11.404us cudaThreadSynchronize
0.00% 12.378us 16 773ns 458ns 3.5040us cudaEventCreateWithFlags
0.00% 7.7670us 1 7.7670us 7.7670us 7.7670us cudaSetDeviceFlags
0.00% 5.5410us 11 503ns 271ns 1.9720us cudaDeviceGetAttribute
0.00% 3.3300us 4 832ns 213ns 2.3060us cuDeviceGetCount
0.00% 2.7160us 1 2.7160us 2.7160us 2.7160us cudaGetDevice
0.00% 1.4270us 4 356ns 275ns 590ns cuDeviceGet
0.00% 591ns 1 591ns 591ns 591ns cuDriverGetVersion
0.00% 572ns 1 572ns 572ns 572ns cuInit