@codejan
2017-12-08T01:57:55.000000Z
字数 3998
阅读 807
计算机
程序计时方法
另外, time .\a.out
可以精确计算a.out的运行时间
-fopenmp
#pragma omp parallel
在下面的花括号(必须换行)中写入用于并行的代码, 需要注意:
- 各个线程执行顺序随机, 执行速度不完全相等
- 利用 omp_get_thread_num()
可以得到每个线程的编号, 然后根据编号可以对不同的线程进行不同的操作
#include <omp.h>
#include <stdio.h>
// OpenMP "hello world": every thread of the parallel region prints
// "hello(<id>) " followed by "world(<id>)\n", where <id> is the value
// returned by omp_get_thread_num() for that thread.
int main(){
    #pragma omp parallel
    {
        const int tid = omp_get_thread_num();

        // The two halves are printed by separate calls on purpose: output
        // from other threads may appear between them, which makes the
        // unordered execution of the threads visible.
        printf("hello(%d) ", tid);
        printf("world(%d)\n", tid);
    }
}
#pragma omp critical
用来指定并行代码中的临界区, 每次只能有一个线程对临界区执行操作.
比如下面计算pi的代码中, 如果不对area += 4/(1 + x*x);
添加临界区, 当同时有两个线程加area
时, 很容易丢掉一个
//tjh 2017.12.6
//calculate Pi by numerical integration
//use #pragma omp critical to avoid wrong answer when parallel
#include <omp.h>
#include <stdio.h>
// Approximates pi by midpoint-rule integration of 4/(1+x^2) over [0,1]
// using n sub-intervals, with the loop iterations shared among OpenMP
// threads. Prints the result with 10 decimal places.
int main(){
    double area = 0;
    double pi;
    const int n = 1000000;
    int i;

    #pragma omp parallel for
    for(i = 0; i < n; i++){
        // x must be private to each thread. Declaring it inside the loop
        // body guarantees that; a variable declared outside the parallel
        // region is shared, and another thread could overwrite it between
        // the assignment and its use below.
        double x = (i+0.5) / n;

        // area is shared by all threads: only one thread at a time may
        // perform the read-modify-write, otherwise updates get lost.
        // (A reduction(+:area) clause would be the faster alternative.)
        #pragma omp critical
        area += 4/(1 + x*x);
    }

    pi = area / n;
    printf("%.10f", pi);
    return 0;
}
#include <stdio.h>
#include <time.h>
#define N 50000001
// prime[] collects the primes found; isnot_prime[k] != 0 marks k as composite.
// Both are globals and therefore start out zero-initialized.
int prime[N/10], isnot_prime[N];

// Serial sieve of Eratosthenes over [2, N).
// Prints the number of primes found and the processor time used, in seconds.
int main(){
    clock_t start, end;
    start = clock();
    int cnt = 0;
    for(int i = 2; i < N; i++){
        if(isnot_prime[i] == 0){
            // i was never marked by a smaller prime, so it is prime
            prime[cnt++] = i;
            for(int j = i+i; j < N; j += i){
                isnot_prime[j] = 1;
            }
        }
    }
    end = clock();
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds; that constant is
    // platform dependent, so it must not be hard-coded as 1000000.
    printf("%d, time = %f\n", cnt, (double)(end-start)/CLOCKS_PER_SEC);
    return 0;
}
#pragma omp parallel for
对for循环并行
//tjh 2017.12.3
//find prime : sieve of Eratosthenes, O(N log log N)
//OpenMP version
#include <stdio.h>
#include <omp.h>
#define N 50000001
// prime[] receives the sieving primes (those with i*i < N);
// isnot_prime[k] != 0 marks k as composite. Globals start zero-initialized.
int prime[N/10], isnot_prime[N];

// OpenMP sieve of Eratosthenes over [2, N). Prints the number of primes found.
int main(){
    int base_cnt = 0;   // number of sieving primes stored in prime[]

    // The outer loop must stay serial: whether i is prime depends on the
    // marks written while processing smaller values of i.
    for(int i = 2; i*i < N; i++){
        if(isnot_prime[i] == 0){
            prime[base_cnt++] = i;
            // Marking the multiples of one prime is independent work, so
            // this loop is safe to split across threads. Multiples below
            // i*i have a smaller prime factor and are already marked;
            // i*i < N holds here, so the start value cannot overflow.
            #pragma omp parallel for
            for(int j = i*i; j < N; j += i){
                isnot_prime[j] = 1;
            }
        }
    }

    // Count every unmarked number; each thread accumulates a private
    // partial count that OpenMP sums at the end of the loop.
    int cnt = 0;
    #pragma omp parallel for reduction(+:cnt)
    for(int i = 2; i < N; i++){
        if(isnot_prime[i] == 0) cnt++;
    }

    // Report the result instead of silently discarding it.
    printf("%d\n", cnt);
    return 0;
}
//2017.12.6
//matrix multiplication
#include <omp.h>
#include <stdio.h>
// Largest supported matrix dimension (size of the static buffers below).
enum { MAX_DIM = 5005 };
int a[MAX_DIM][MAX_DIM], b[MAX_DIM][MAX_DIM], c[MAX_DIM][MAX_DIM];

// Reads an n x n integer matrix from stdin into m (row by row).
// Returns 1 on success, 0 if any element could not be parsed.
static int read_matrix(int (*m)[MAX_DIM], int n){
    for(int i = 0; i < n; i++){
        for(int j = 0; j < n; j++){
            if(scanf("%d", &m[i][j]) != 1) return 0;
        }
    }
    return 1;
}

// Reads n followed by two n x n integer matrices from "data.txt",
// computes c = a * b with the row loop parallelized by OpenMP, and prints
// the sum of the diagonal of c. Returns 1 if the input is missing or invalid.
int main(){
    if(freopen("data.txt", "r", stdin) == NULL){
        fprintf(stderr, "cannot open data.txt\n");
        return 1;
    }

    int n;
    // Reject sizes that would index past the static buffers.
    if(scanf("%d", &n) != 1 || n < 0 || n > MAX_DIM){
        fprintf(stderr, "invalid matrix size (expected 0..%d)\n", MAX_DIM);
        return 1;
    }

    // Input is read serially: reading the file inside an OpenMP loop is very slow.
    if(!read_matrix(a, n) || !read_matrix(b, n)){
        fprintf(stderr, "data.txt does not contain two %d x %d matrices\n", n, n);
        return 1;
    }

    // Only the i loop is parallelized: each thread writes distinct rows of
    // c, so no critical section is needed.
    #pragma omp parallel for
    for(int i = 0; i < n; i++){
        for(int j = 0; j < n; j++){
            int sum = 0;
            for(int k = 0; k < n; k++){
                sum += a[i][k] * b[k][j];
            }
            c[i][j] = sum;
        }
    }

    int ans = 0;
    for(int i = 0; i < n; i++) ans += c[i][i];
    printf("%d", ans);
    return 0;
}
编译: mpicc name.c -o name
运行: mpirun -np x name
-np 用于指定要启动的进程个数 (x)
//tjh 2017.12.6
//mpi: check circuit
#include <mpi.h>
#include <stdio.h>
// Tests one input combination of the 16-input circuit.
//   id : number printed in front of a satisfying combination
//   z  : the combination; bit i of z is the value of input v[i]
// Returns 1 (after printing "<id>)<v[0]..v[15]>\n") when the circuit is
// satisfied, 0 otherwise.
int check_circuit(int id, int z){
    int v[16];
    int bit;
    int satisfied;

    // Unpack z into v[]; every entry is assigned, so none stays uninitialized.
    for(bit = 0; bit < 16; bit++){
        v[bit] = (z & (1 << bit)) ? 1 : 0;
    }

    satisfied =
           (v[0] || v[1])    && (!v[1] || !v[3])  && (v[2] || v[3])
        && (!v[3] || !v[4])  && (v[4] || !v[5])   && (v[5] || !v[6])
        && (v[5] || v[6])    && (v[6] || !v[15])  && (v[7] || !v[8])
        && (!v[7] || !v[13]) && (v[8] || v[9])
        && (v[8] || !v[9])   && (!v[9] || !v[10])
        && (v[9] || v[11])   && (v[10] || v[11])
        && (v[12] || v[13])  && (v[13] || !v[14])
        && (v[14] || v[15]);

    if(!satisfied){
        return 0;
    }

    // Report the satisfying combination, lowest input first.
    printf("%d)", id);
    for(bit = 0; bit < 16; bit++){
        printf("%d", v[bit]);
    }
    printf("\n");
    return 1;
}
// MPI driver for the circuit check: the 65536 input combinations are dealt
// out cyclically (process id tests id, id+p, id+2p, ...), each process
// counts its satisfying combinations, and the counts are summed on rank 0.
int main(int argc, char *argv[]){
    int k, id, p;
    int ans = 0;          // satisfying combinations found by this process
    int global_ans = 0;   // total over all processes (meaningful on rank 0)

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    for(k = id; k < 65536; k += p) ans += check_circuit(id, k);

    MPI_Reduce(&ans, &global_ans, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    // Terminate the summary line so that it does not run into the
    // "Process 0 is done" message printed right after it.
    if(id == 0) printf("There are %d different solutions.\n", global_ans);
    printf("Process %d is done\n", id);
    fflush(stdout);

    MPI_Finalize();
    return 0;
}
#include <iostream>
#include <math.h>
// Element-wise accumulation: adds x[k] into y[k] for the first n elements.
// x is only read; y is updated in place. Does nothing when n <= 0.
void add(int n, float *x, float *y){
    const float *src = x;
    float *dst = y;
    for(int remaining = n; remaining > 0; --remaining){
        *dst = *src + *dst;
        ++src;
        ++dst;
    }
}
// CPU baseline: builds two arrays of 2^20 floats (all 1.0f and all 2.0f),
// adds the first into the second with add(), then releases both arrays.
int main(void){
    const int N = 1 << 20;

    float *x = new float[N];
    float *y = new float[N];

    int idx = 0;
    while(idx < N){
        x[idx] = 1.0f;
        y[idx] = 2.0f;
        ++idx;
    }

    // y[k] becomes x[k] + y[k] for every element
    add(N, x, y);

    delete [] x;
    delete [] y;
    return 0;
}
#include <iostream>
#include <math.h>
// Kernel: y[i] = x[i] + y[i] for 0 <= i < n.
// Works with any 1-D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads, so every
// element is handled by exactly one thread. With <<<1,1>>> the single
// thread walks the whole array, exactly like a plain serial loop.
// x and y must be accessible from the device and hold at least n floats.
__global__
void add(int n, float *x, float *y){
    // 64-bit arithmetic so that neither the global index nor the step can
    // overflow for large grids or for n close to INT_MAX.
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)blockDim.x * gridDim.x;
    for(long long i = first; i < n; i += stride)
        y[i] = x[i] + y[i];
}
// GPU version of the array addition: allocates two managed arrays of 2^20
// floats, fills them on the host (1.0f and 2.0f), runs the add kernel and
// waits for it to finish. Returns 0 on success, 1 if any CUDA call failed
// (the error string is written to std::cerr).
int main(void){
    int N = 1 << 20;
    // Each pointer needs its own '*'; "float *x, float *y;" does not compile.
    float *x = 0, *y = 0;

    // Managed memory is written by the host loop below and then read and
    // written by the kernel through the same pointers.
    cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
    if(err == cudaSuccess) err = cudaMallocManaged(&y, N*sizeof(float));

    if(err == cudaSuccess){
        for(int i = 0; i < N; i++){
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        // One block with one thread: the kernel processes the array serially.
        add<<<1,1>>>(N, x, y);
        // A launch does not return a status: launch errors are picked up
        // with cudaGetLastError(), execution errors by the synchronize call.
        err = cudaGetLastError();
        if(err == cudaSuccess) err = cudaDeviceSynchronize();
    }

    if(err != cudaSuccess)
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;

    // Pointers that were never allocated are still 0 here.
    cudaFree(x);
    cudaFree(y);
    return err == cudaSuccess ? 0 : 1;
}