@codejan
2017-12-08T01:57:55.000000Z
字数 3998
阅读 899
计算机
程序计时方法
另外, 在终端中使用 time ./a.out 可以精确计算 a.out 的运行时间
编译时需要加上 -fopenmp 选项
#pragma omp parallel
在下面的花括号(必须换行)中写入用于并行的代码, 需要注意:
- 各个线程执行顺序随机, 执行速度不完全相等
- 利用 omp_get_thread_num() 可以得到每个线程的编号, 然后根据编号可以对不同的线程进行不同的操作
#include <omp.h>
#include <stdio.h>

/*
 * OpenMP "hello world".
 *
 * Every thread of the parallel region prints "hello(<id>) " followed by
 * "world(<id>)\n", where <id> is the value returned by omp_get_thread_num().
 * The two printf calls of different threads are not synchronized with each
 * other, so their output may interleave.
 */
int main()
{
    /* The opening brace of the parallel region must be on its own line. */
    #pragma omp parallel
    {
        const int thread_id = omp_get_thread_num();

        printf("hello(%d) ", thread_id);

        /* Disabled busy work between the two prints (kept from the original): */
        // int tmp;
        // for(int i = 1; i <= 100000; i++) tmp += i*i;

        printf("world(%d)\n", thread_id);
    }
}
#pragma omp critical 用来指定并行代码中的临界区, 每次只能有一个线程对临界区执行操作.
比如下面计算pi的代码中, 如果不对area += 4/(1 + x*x);添加临界区, 当同时有两个线程加area时, 很容易丢掉一个
// tjh 2017.12.6
// Calculate Pi by numerical integration of 4/(1+x^2) over [0,1]
// (midpoint rule with n sub-intervals).
// Uses #pragma omp critical to avoid a wrong answer when running in parallel.
#include <omp.h>
#include <stdio.h>

int main()
{
    double area = 0;        /* shared accumulator, updated only inside the critical section */
    double pi;
    const int n = 1000000;  /* number of sub-intervals */

    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        /*
         * x is declared inside the loop body so that every thread works on
         * its own copy. Declaring it outside the parallel region would make
         * it shared, and another thread could overwrite it between the
         * assignment and its use below.
         */
        const double x = (i + 0.5) / n;

        /*
         * Only one thread at a time may update the shared sum; without this
         * concurrent "+=" operations could lose updates.
         * (A reduction(+:area) clause on the loop is an alternative that
         * avoids serializing every iteration.)
         */
        #pragma omp critical
        area += 4 / (1 + x * x);
    }

    pi = area / n;
    printf("%.10f", pi);
}
#include <stdio.h>
#include <time.h>
#define N 50000001

/* prime[] collects the primes found; isnot_prime[k] != 0 marks k as composite. */
int prime[N/10], isnot_prime[N];

/*
 * Serial sieve: finds all primes below N, then prints how many were found
 * together with the processor time consumed by the sieve loop.
 */
int main()
{
    clock_t start, end;
    start = clock();

    int cnt = 0;
    for (int i = 2; i < N; i++) {
        if (isnot_prime[i] == 0) {
            prime[cnt++] = i;
            /* Mark every multiple of the prime i as composite. */
            for (int j = i + i; j < N; j += i) {
                isnot_prime[j] = 1;
            }
        }
    }

    end = clock();
    /*
     * clock() counts in units of 1/CLOCKS_PER_SEC seconds; dividing by the
     * macro (instead of a hard-coded 1000000) gives seconds on every platform.
     */
    printf("%d, time = %f\n", cnt, (double)(end - start) / CLOCKS_PER_SEC);
    return 0;
}
#pragma omp parallel for: 对 for 循环并行
// tjh 2017.12.3
// find prime (author's note: NlogN)
// OpenMP version
#include <stdio.h>
#include <omp.h>
#define N 50000001

/* prime[] collects the sieving primes; isnot_prime[k] != 0 marks k as composite. */
int prime[N/10], isnot_prime[N];

/*
 * Sieve for primes below N with the marking loop parallelized by OpenMP,
 * then count the unmarked numbers and print the count.
 */
int main()
{
    int cnt = 0;

    /*
     * The outer loop has to stay serial: whether i is treated as prime
     * depends on the marks written by earlier iterations.
     */
    // #pragma omp parallel for //wrong: iterations depend on earlier ones
    for (int i = 2; i*i < N; i++) {
        if (isnot_prime[i] == 0) {
            prime[cnt++] = i;
            /*
             * Start at i*i: every smaller multiple of i has a prime factor
             * below i and was already marked in an earlier outer iteration.
             * i*i < N is guaranteed by the outer loop condition, so the
             * product cannot overflow here.
             */
            #pragma omp parallel for
            for (int j = i*i; j < N; j += i) {
                isnot_prime[j] = 1;
            }
        }
    }

    /* Count everything in [2, N) that was never marked as composite. */
    cnt = 0;
    for (int i = 2; i < N; i++) {
        if (isnot_prime[i] == 0) cnt++;
    }

    /* Report the result; previously the count was computed and discarded. */
    printf("%d\n", cnt);
    return 0;
}
// 2017.12.6
// matrix multiplication
#include <omp.h>
#include <stdio.h>

#define MAX_N 5005  /* capacity of the statically allocated matrices */

int a[MAX_N][MAX_N], b[MAX_N][MAX_N], c[MAX_N][MAX_N];

/*
 * Reads n followed by two n x n integer matrices from "data.txt",
 * computes c = a * b with the row loop parallelized by OpenMP, and prints
 * the sum of the diagonal elements of c.
 *
 * Returns 0 on success, 1 if the input file cannot be opened or is malformed.
 */
int main()
{
    if (freopen("data.txt", "r", stdin) == NULL) {
        perror("data.txt");
        return 1;
    }

    int n;
    /* Reject sizes that would index outside the static arrays. */
    if (scanf("%d", &n) != 1 || n < 0 || n > MAX_N) {
        fprintf(stderr, "invalid matrix size (expected 0..%d)\n", MAX_N);
        return 1;
    }

    // read files + openmp : very slow, so the input loops stay serial
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (scanf("%d", &a[i][j]) != 1) {
                fprintf(stderr, "unexpected end of input while reading matrix a\n");
                return 1;
            }
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (scanf("%d", &b[i][j]) != 1) {
                fprintf(stderr, "unexpected end of input while reading matrix b\n");
                return 1;
            }
            c[i][j] = 0;
        }
    }

    /*
     * Only the i loop is parallel, so each thread writes its own rows of c
     * and no critical section is needed.
     * The k loop is placed outside the j loop so that the innermost loop
     * walks b[k][*] and c[i][*] along a row; each c[i][j] still receives
     * exactly the same terms as with the i-j-k order.
     */
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            const int aik = a[i][k];
            for (int j = 0; j < n; j++) {
                c[i][j] += aik * b[k][j];
            }
        }
    }

    /* Sum of the diagonal of the product matrix. */
    int ans = 0;
    for (int i = 0; i < n; i++) ans += c[i][i];
    printf("%d", ans);
}
编译: mpicc name.c -o name
运行: mpirun -np x name (其中 -np 指定要生成的进程个数, 此处为 x)
//tjh 2017.12.6//mpi: check circuit#include <mpi.h>#include <stdio.h>int check_circuit(int id, int z){int v[16];int i;for(i = 0; i < 16; i++){if( z&(1<<i) ) v[i] = 1;else v[i] = 0; //remember to initialize}// for(i = 0; i < 16; i++) printf("%d", v[i]);// printf("\n");if( (v[0] || v[1]) && (!v[1] || !v[3]) && (v[2] || v[3])&& (!v[3] || !v[4]) && (v[4] || !v[5]) && (v[5] || !v[6])&& (v[5] || v[6]) && (v[6] || !v[15]) && (v[7] || !v[8])&& (!v[7] || !v[13]) && (v[8] || v[9])&& (v[8] || !v[9]) && (!v[9] || !v[10])&& (v[9] || v[11]) && (v[10] || v[11])&& (v[12] || v[13]) && (v[13] || !v[14])&& (v[14] || v[15]) ){printf("%d)", id);for(i = 0; i < 16; i++) printf("%d", v[i]);printf("\n");return 1;}return 0;}int main(int argc, char *argv[]){int k, id, p;int global_ans = 0;int ans = 0;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &id);MPI_Comm_size(MPI_COMM_WORLD, &p);ans = 0;for(k = id; k < 65536; k += p) ans += check_circuit(id, k);MPI_Reduce(&ans, &global_ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);if(id==0) printf("There are %d different solutions.", global_ans);printf("Process %d is done\n", id);fflush(stdout);MPI_Finalize();return 0;}
#include <iostream>
#include <math.h>

// Element-wise addition on the CPU: y[i] becomes x[i] + y[i] for i in [0, n).
void add(int n, float *x, float *y)
{
    for (int idx = 0; idx < n; ++idx) {
        y[idx] += x[idx];
    }
}

// Allocates two arrays of 2^20 floats, fills them with 1.0f and 2.0f,
// adds them with add(), and releases the memory.
int main(void)
{
    int N = 1 << 20;

    float *x = new float[N];
    float *y = new float[N];

    // Initialize the inputs.
    int pos = 0;
    while (pos < N) {
        x[pos] = 1.0f;
        y[pos] = 2.0f;
        ++pos;
    }

    add(N, x, y);

    delete [] x;
    delete [] y;

    return 0;
}
#include <iostream>
#include <math.h>

// CUDA kernel: element-wise addition, y[i] becomes x[i] + y[i] for i in [0, n).
// The loop covers the whole range itself, which matches the <<<1,1>>> launch
// in main.
__global__
void add(int n, float *x, float *y)
{
    for (int i = 0; i < n; i++)
        y[i] = x[i] + y[i];
}

// Allocates two managed arrays of 2^20 floats, fills them on the host,
// runs the add kernel, waits for it to finish, and frees the memory.
// Returns 0 on success, 1 if a CUDA call reports an error.
int main(void)
{
    int N = 1 << 20;
    // Two pointer declarators in one declaration; the original
    // "float *x, float *y;" does not compile.
    float *x = nullptr, *y = nullptr;

    // Memory allocated with cudaMallocManaged is used from both host code
    // (the fill loop below) and the kernel.
    cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: " << cudaGetErrorString(err) << '\n';
        return 1;
    }
    err = cudaMallocManaged(&y, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: " << cudaGetErrorString(err) << '\n';
        cudaFree(x);
        return 1;
    }

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    add<<<1,1>>>(N, x, y);

    // Wait for the kernel to finish before releasing the buffers; the
    // returned status also reports errors from the launch or execution.
    err = cudaDeviceSynchronize();
    const bool ok = (err == cudaSuccess);
    if (!ok) {
        std::cerr << "add kernel failed: " << cudaGetErrorString(err) << '\n';
    }

    cudaFree(x);
    cudaFree(y);

    return ok ? 0 : 1;
}