前言
最近在写一个tsp回溯算法的pthread并行算法。
进行的实验是14个城市,给定它们在平面坐标系上的坐标,彼此的距离即为二者的欧几里得距离。求经过这14个城市的最短汉密尔顿回路。
并行程序使用了6个线程,串行程序即普通单CPU程序。
算法使用的是使用栈模拟递归的回溯法,这种方法的时间复杂度是O(n!)。还有一种动态规划算法,时间复杂度是O(2^n * n ),因为它消除了大量的子问题,比回溯法快得多。但回溯法易于并行实现。
设并行的核数是m,那么每个线程的时间复杂度是O(n!/ m)。
此外,为了代码书写简单,同时避免malloc的开销,串行程序用的是数组,并行程序用的是vector,(问题所在)。
理论上的实验结果应该是并行程序比串行程序快将近6倍,但得到的结果反而是并行程序比串行程序慢了4、5倍(时钟周期数)。
百思不得其解。
怀疑
第一个怀疑是,是不是同一个程序在创建的线程中的执行速度比主线程要慢。最终排除了。
第二个怀疑是,是不是linux的时间机制有问题。当时使用的是:
clock_t start = clock();
cout << "执行时间: " << clock() - start << endl;
后来改用
#include <sys/time.h>
long computeInterval(const struct timeval& start, const struct timeval& mid)
{
long start_ms = start.tv_sec * 1000 + start.tv_usec / 1000;
long mid_ms = mid.tv_sec * 1000 + mid.tv_usec / 1000;
return mid_ms - start_ms;
}
struct timeval start, mid;
gettimeofday(&start, NULL);
// ... 被测代码 ...
gettimeofday(&mid, NULL);
cout << "执行时间: " << computeInterval(start, mid) << endl;
这个猜测是合理的。 clock() 函数计算的是这个进程执行占用CPU时间的时钟周期数,而我们这里有6个线程,clock() 函数就包含了所有的线程执行时间以及上下文切换的时间,时钟周期数肯定会比串行要多。
改用第二种方法后,计算的是程序执行开始与执行结束之间的墙钟时间(wall-clock time)。此时并行程序与串行程序的执行时间近似相同。
第三个怀疑
vector与普通数组的速度差异。
进行测试:使用vector与普通数组进行同样的scale次迭代,每次迭代先尾插一个元素,其中9/10的迭代(即 i % 10 != 0 时)随后立即弹出尾部元素。
不断调整scale,观测二者的执行速度。
#include <iostream>
#include <algorithm>
#include <pthread.h>
#include <cmath>
#include <vector>
#include <sys/time.h>
#include <cstdlib>
using namespace std;
// One node of the search stack: visited-city bitmask `s`, current
// city `u`, and the accumulated path cost so far.
struct Node_t{
    int s, u;
    double cost;
    Node_t() {}  // deliberately leaves members uninitialized (stack slots are overwritten before use)
    Node_t(int ss, int uu, double cc) : s(ss), u(uu), cost(cc) {}
};
// Milliseconds elapsed between two gettimeofday() samples.
// BUGFIX: subtract the fields first, then scale — the old code computed
// tv_sec * 1000 per sample, which overflows a 32-bit long for any
// post-1970 epoch timestamp (tv_sec ~1.7e9 → ~1.7e12 ms).
long computeInterval(const struct timeval& start, const struct timeval& mid)
{
    long sec_diff  = mid.tv_sec  - start.tv_sec;
    long usec_diff = mid.tv_usec - start.tv_usec;  // may be negative; truncation still yields the right ms
    return sec_diff * 1000 + usec_diff / 1000;
}
// Fixtures for the raw-array benchmark variant.
// NOTE(review): 1e8 Node_t entries (~16 B each) is ~1.6 GB of static
// (BSS) storage — the process needs that much address space even if
// most pages are never touched.
const int maxn = 1e8;
int top = -1;   // index of the current stack top; -1 means empty
Node_t stk[maxn];   // preallocated array stack used by myThread1
// Array-stack benchmark: every iteration pushes one node onto the
// global array stack; on 9 of every 10 iterations (i % 10 != 0) the
// node is popped right back off, so the stack grows by ~limit/10.
void myThread1(long limit)
{
    for (int step = 0; step < limit; ++step)
    {
        stk[++top] = {0, 0, 0};
        if (step % 10 != 0)
            --top;
    }
}
// Global vector operated on by the benchmark below.
vector<Node_t> vec;

// std::vector benchmark: mirrors myThread1's push/pop pattern, but
// growth past the current capacity triggers reallocation + copies.
void myThread(long limit)
{
    for (int step = 0; step < limit; ++step)
    {
        vec.push_back({0, 0, 0});
        if (step % 10 != 0)
            vec.pop_back();
    }
}
// Driver: runs both benchmarks with 10^k operations, where k = argv[1],
// and prints each one's elapsed wall-clock time in milliseconds.
int main(int argc, char* argv[])
{
    // BUGFIX: the old code dereferenced argv[1] unconditionally —
    // undefined behavior when the program is run without an argument.
    if (argc < 2)
    {
        cout << "usage: " << argv[0] << " <power-of-10 scale>" << endl;
        return 1;
    }
    int scale = pow(10, atoi(argv[1]));
    cout << "data scale is " << atoi(argv[1]) << " power of 10" << endl;
    timeval begin, mid, last;
    gettimeofday(&begin, NULL);
    myThread1(scale);                              // raw-array version
    gettimeofday(&mid, NULL);
    myThread(scale);                               // std::vector version
    gettimeofday(&last, NULL);
    cout << computeInterval(begin, mid) << endl;   // array time (ms)
    cout << computeInterval(mid, last) << endl;    // vector time (ms)
    return 0;
}
执行结果如下(上面为串行时间,下面为并行时间,时间单位是ms):
当数据量规模在1e6往上时,vector操作时间大概是数组的4-6倍。
原因应该出在vector的内存机制上,当存储数多于vector预存的内存单元数后,vector会申请一块更大的内存空间(增长倍数由具体实现决定,GCC的libstdc++通常为当前容量的2倍),并将现在的数据拷贝到新的空间中。因此,数据量很大时,vector的操作中有很大量的内存copy,严重降低了效率。
代码
#include <iostream>
#include <algorithm>
#include <pthread.h>
#include <cmath>
#include <vector>
#include <ctime>
#include <sys/time.h>
#include <semaphore.h>
#include <cstring>
#include <cstdio>
using namespace std;
const int comm_sz = 4;   // number of worker threads
//experimental data, the max size needs for 14 cities is 91
const int arr_size = 105;   // per-thread stack capacity (>= 91 observed for 14 cities)
const int city_num = 14;   // problem size: number of cities
const int inf = 0x3f3f3f3f;   // "infinity" sentinel (~1.06e9), larger than any real tour cost
double x[city_num], y[city_num];   // city coordinates, read from in.txt
double dist[city_num][city_num];   // pairwise Euclidean distances, filled in main
const int totalS = (1 << city_num) - 1;   // bitmask value meaning "all cities visited"
double ans = inf;   // best tour cost found so far; shared, writes guarded by mutex_ans
// One node of the DFS stack: visited-city bitmask `s`, current city
// `u`, and the cost of the partial tour that reached it.
struct Node_t{
    int s, u;
    double cost;
    Node_t() {}  // deliberately uninitialized; stack slots are always written before use
    Node_t(int ss, int uu, double cc) : s(ss), u(uu), cost(cc) {}
};
vector<Node_t> stk;   // initial task frontier built by main (one node per first move out of city 0)
Node_t newStk[arr_size];   // hand-off buffer: donated tasks travel between threads through here
int newTop, newBottom;   // bounds of the donated block; newTop == -1 means the buffer is free
pthread_mutex_t mutex_ans;   // guards updates to ans
pthread_mutex_t mutex_split;   // guards newStk/newTop/newBottom/threads_in_wait
pthread_cond_t cond_split;   // signaled when tasks are donated, broadcast when all threads finish
int threads_in_wait = 0;   // number of idle threads currently waiting for stolen work
// Milliseconds elapsed between two gettimeofday() samples.
// BUGFIX: subtract the fields first, then scale — the old code computed
// tv_sec * 1000 per sample, which overflows a 32-bit long for any
// post-1970 epoch timestamp (tv_sec ~1.7e9 → ~1.7e12 ms).
long computeInterval(const struct timeval& start, const struct timeval& end)
{
    long sec_diff  = end.tv_sec  - start.tv_sec;
    long usec_diff = end.tv_usec - start.tv_usec;  // may be negative; truncation still yields the right ms
    return sec_diff * 1000 + usec_diff / 1000;
}
// Thread-safe minimum update of the shared best tour cost `ans`.
void update_best(double cost)
{
    pthread_mutex_lock(&mutex_ans);
    if (cost < ans)       // equivalent to ans = min(ans, cost)
        ans = cost;
    pthread_mutex_unlock(&mutex_ans);
}
//thread my_rank is in charge for the myStk defined by the top and the bottom
// Depth-first TSP search over the stack segment myStk[bottom+1 .. top],
// owned by thread my_rank. Branch-and-bound: partial tours whose cost
// already meets the best known answer are pruned. When other threads
// are idle and this thread has enough pending work, the oldest quarter
// of its stack is handed off through the shared newStk buffer.
void carryTask(Node_t* myStk, int top, int bottom, int my_rank)
{
    //stack simulation dfs
    while (top != bottom)
    {
        Node_t node = myStk[top--];
        // Bound: prune partial tours that can no longer beat `ans`.
        // (Unlocked read of ans — only used as a pruning heuristic.)
        if (node.cost >= ans)
            continue;
        // All cities visited: a complete tour only counts if it closed
        // back to city 0 (node.u == 0 after the final 1<<0 step).
        if (node.s == totalS)
        {
            if (node.u == 0 && node.cost < ans)
                update_best(node.cost);
            continue;
        }
        // Expand: push one child per still-unvisited city.
        for (int v = 0; v < city_num; v++)
            if (!(node.s & (1 << v)))
                myStk[++top] = {node.s | (1 << v), v, node.cost + dist[node.u][v]};
        // Cheap unlocked peek; the decision is re-validated under
        // mutex_split below. newTop == -1 signals newStk is free.
        if (threads_in_wait != 0 && top - bottom >= 4 && newTop == -1)
        {
            pthread_mutex_lock(&mutex_split);
            if (threads_in_wait > 0 && newTop == -1)
            {
                threads_in_wait--;
                // Donate the oldest quarter of the pending tasks,
                // i.e. indices bottom+1 .. bottom+tasks_num.
                unsigned tasks_num = ((top - bottom) / 4);
                memcpy(newStk, myStk + bottom + 1, tasks_num * sizeof(Node_t));
                newTop = tasks_num - 1;
                newBottom = -1;
                // BUGFIX: advance our bottom past the donated range.
                // The old code set `bottom = newTop`, which is correct
                // only when bottom was -1; on a second split in the same
                // call it rewound bottom and re-ran donated/stale slots.
                bottom += tasks_num;
                pthread_cond_signal(&cond_split);
                pthread_mutex_unlock(&mutex_split);
            }
            else
                pthread_mutex_unlock(&mutex_split);
        }
    }
}
void* myThread(void* arg)
{
long my_rank = (long)arg;
//malloc memory rather than use vector
Node_t* myStk = new Node_t[arr_size];
int top = -1, bottom = -1;
//tasks distribute
for (int i = 0; i < stk.size(); i++)
if (i % comm_sz == my_rank)
myStk[++top] = stk[i];
//do tasks of itself
carryTask(myStk, top, bottom, my_rank);
//most threads will sleep, so the while loop effect the efficiency little
while (1)
{
pthread_mutex_lock(&mutex_split);
threads_in_wait++;
if (threads_in_wait == comm_sz)
{
//cout << "last one is " << my_rank << endl;
pthread_cond_broadcast(&cond_split);
pthread_mutex_unlock(&mutex_split);
break;
}
else
{
//cout << "thread " << my_rank <<" finish the job" << endl;
while (pthread_cond_wait(&cond_split, &mutex_split) != 0);
if (threads_in_wait == comm_sz)
{
pthread_mutex_unlock(&mutex_split);
break;
}
else
{
unsigned tasks_num = newTop - newBottom;
top = newTop;
bottom = newBottom;
memcpy(myStk, newStk, tasks_num * sizeof(Node_t));
//printf("thread %d split tasks of number %u\n", my_rank, tasks_num);
newTop = -1;
pthread_mutex_unlock(&mutex_split);
carryTask(myStk, top, bottom, my_rank);
}
}
}
return NULL;
}
// Entry point: reads city coordinates from in.txt, builds the distance
// matrix and initial task frontier, runs comm_sz worker threads, and
// prints the optimal tour cost plus elapsed wall-clock time.
int main()
{
    // BUGFIX: check the redirect; the old code read garbage via cin
    // when in.txt was missing.
    if (freopen("in.txt", "r", stdin) == NULL)
    {
        fprintf(stderr, "cannot open in.txt\n");
        return 1;
    }
    for (int i = 0; i < city_num; i++)
        cin >> x[i];
    for (int i = 0; i < city_num; i++)
        cin >> y[i];
    // Pairwise Euclidean distances (symmetric, zero diagonal).
    for (int i = 0; i < city_num; i++)
    {
        dist[i][i] = 0;
        for (int j = i + 1; j < city_num; j++)
        {
            dist[i][j] = dist[j][i] = sqrt(pow(x[i] - x[j], 2) + pow(y[i] - y[j], 2));
        }
    }
    //use function gettimeofday to compute diff-time
    struct timeval start;
    struct timeval end;
    gettimeofday(&start, NULL);
    // Seed the frontier: every tour starts 0 -> v, v = 1..city_num-1;
    // these (city_num - 1) tasks are distributed among the threads.
    for (int v = 1; v < city_num; v++)
        stk.push_back({(1 << v), v, dist[0][v]});
    // Initial upper bound: cost of the trivial tour 0-1-2-...-(n-1)-0,
    // so pruning works from the very first expansion.
    ans = dist[city_num - 1][0];
    for (int v = 0; v < city_num - 1; v++)
        ans += dist[v][v + 1];
    //do some initialization
    pthread_mutex_init(&mutex_ans, NULL);
    pthread_mutex_init(&mutex_split, NULL);
    pthread_cond_init(&cond_split, NULL);
    threads_in_wait = 0;
    newTop = -1;
    //create threads and wait for them
    pthread_t P[comm_sz];
    for (long i = 0; i < comm_sz; i++)
        pthread_create(&P[i], NULL, myThread, (void*)i);
    for (long i = 0; i < comm_sz; i++)
        pthread_join(P[i], NULL);
    //compute diff-time
    gettimeofday(&end, NULL);
    printf("answer is %.2lf\n", ans);
    printf("total %ldms\n", computeInterval(start, end));
    // BUGFIX: release synchronization objects (previously never destroyed).
    pthread_cond_destroy(&cond_split);
    pthread_mutex_destroy(&mutex_split);
    pthread_mutex_destroy(&mutex_ans);
    return 0;
}
/*
1304 3639 4177 3712 3488 3326 3238 4196 4312 4386 3007 2562 2788 2381 1332
2312 1315 2244 1399 1535 1556 1229 1004 790 570 1970 1756 1491 1676 695
*/