The idea for this experimental code comes from the paper 《PRAM和LARPBS模型上的近似串匹配并行算法》 (Parallel Algorithms for Approximate String Matching on the PRAM and LARPBS Models) by 钟诚 and 陈国良. It parallelizes the serial code described in the previous article by combining the producer-consumer model with the readers-writers model: thread 0 runs the producer (writer) routine, while the remaining threads run the consumer (reader) routine. The producer and the consumers take turns: while the consumers run, the producer waits, and while the producer runs, the consumers wait. Starvation cannot occur, because a one-dimensional array of semaphores guarantees that each consumer executes exactly once per round and then blocks; once every consumer has finished its round, the producer executes, and upon finishing it releases all the consumers again, forming a cycle.
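Before the full listing, the handshake is easier to see in isolation. Below is a minimal, self-contained sketch of just the round-based alternation, with dummy printing in place of the real computation; the semaphore names (mutex, wrt, empty, pth) and the counters rc/rc1 mirror the full listing, while ROUNDS and CONSUMERS are illustrative constants of this sketch only:

#include <cstdio>
#include <pthread.h>
#include <semaphore.h>

const int ROUNDS = 3;    // illustrative only
const int CONSUMERS = 2; // illustrative only
sem_t mutex, wrt, empty, pth[CONSUMERS + 1];
int rc = 0, rc1 = 0;

void *producer(void *)
{
    // the first round is enabled by the initial pth permits,
    // so the producer only prepares ROUNDS - 1 further rounds
    for (int r = 1; r < ROUNDS; r++)
    {
        sem_wait(&empty); // block until every consumer finished the round
        sem_wait(&wrt);   // writer lock: no reader is active now
        printf("producer prepares round %d\n", r);
        for (int i = 1; i <= CONSUMERS; i++)
            sem_post(&pth[i]); // one permit per consumer for the next round
        sem_post(&wrt);
    }
    return NULL;
}

void *consumer(void *arg)
{
    int id = *(int *)arg;
    for (int r = 0; r < ROUNDS; r++)
    {
        sem_wait(&pth[id]); // at most one execution per round
        sem_wait(&mutex);
        if (++rc == 1)
            sem_wait(&wrt); // the first reader locks out the writer
        sem_post(&mutex);
        printf("consumer %d works in round %d\n", id, r);
        sem_wait(&mutex);
        if (++rc1 == CONSUMERS) // the last reader closes the round
        {
            rc = rc1 = 0;
            sem_post(&empty); // wake the producer
            sem_post(&wrt);   // give back the writer lock
        }
        sem_post(&mutex);
    }
    return NULL;
}

int main()
{
    sem_init(&mutex, 0, 1);
    sem_init(&wrt, 0, 1);
    sem_init(&empty, 0, 0);
    pthread_t t[CONSUMERS + 1];
    int ids[CONSUMERS + 1];
    for (int i = 1; i <= CONSUMERS; i++)
    {
        sem_init(&pth[i], 0, 1); // consumers may start round 0 immediately
        ids[i] = i;
        pthread_create(&t[i], NULL, consumer, &ids[i]);
    }
    pthread_create(&t[0], NULL, producer, NULL);
    for (int i = 0; i <= CONSUMERS; i++)
        pthread_join(t[i], NULL);
    return 0;
}

Each consumer can pass sem_wait(&pth[id]) only once per round, and only the last consumer of a round posts empty, so producer and consumers strictly alternate without busy-waiting.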
The full code is as follows:
#include <iostream>
#include <algorithm>
#include <string>
#include <sstream>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <semaphore.h>
#include <vector>
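// Roles of the global semaphores:
//   mutex  - protects the reader counters rc and rc1
//   wrt    - writer lock: held by the producer, or collectively by the readers
//   pth[i] - one execution permit per round for consumer i
//   empty  - signals the producer that all consumers have finished the round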
sem_t mutex, wrt, *pth, empty;
template <typename T>
inline T min(const T &a, const T &b, const T &c) //return by value, not by reference to a temporary
{
return std::min(std::min(a, b), c);
}
//argument struct for the producer thread
struct thread_work_t
{
int *L, *PL, *PP; //the three anti-diagonals (current and two previous)
int *buffer_size; //number of cells to compute on the current anti-diagonal
int *index; //sequence number of the current anti-diagonal
int m, n; //number of rows and columns
int cpus; //total number of threads (CPUs)
};
//producer (writer) routine
void *producer(void *thread_work_uncasted)
{
struct thread_work_t *thread_work = (struct thread_work_t *)thread_work_uncasted;
int m = thread_work->m; //number of rows
int n = thread_work->n; //number of columns
int *L = thread_work->L;
int *PL = thread_work->PL;
int *PP = thread_work->PP;
int cpus = thread_work->cpus;
int *buffer_size = thread_work->buffer_size; //number of cells to compute on the diagonal
int *index = thread_work->index; //sequence number of the diagonal
while ((*index) < m + n)
{
sem_wait(&empty); //block until all consumers have finished the round
sem_wait(&wrt); //writer lock: no consumer may run while the producer updates shared data
//writer section
(*index)++;
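//shift the window: the current diagonal L becomes PL, and PL becomes PP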
for (int i = 0; i < m + 1; i++)
{
PP[i] = PL[i];
PL[i] = L[i];
}
//set up the boundary cells and the cell count of the new diagonal
if (*index <= m)
{
*buffer_size = *index - 1;
L[0] = *index;
L[*index] = *index;
}
else if (*index <= n)
{
*buffer_size = m;
L[0] = *index;
}
else
{
*buffer_size = m + n + 1 - *index;
}
//release every consumer for the new round
for (int i = 1; i < cpus; i++)
sem_post(&pth[i]);
sem_post(&wrt);
}
return NULL;
}
//argument struct for the consumer threads
struct thread_work_c
{
int *L, *PL, *PP; //the three anti-diagonals
std::string str1, str2; //the two input strings being compared
int proccor_index; //worker id, starting from 1; id 0 is the producer
int proccor_size; //number of consumer threads
int *buffer_size; //number of cells to compute on the current anti-diagonal
int *index; //sequence number of the current anti-diagonal
int m, n; //number of rows and columns
int *rc; //reader count, initialized to 0
int *rc1; //counter that lets all consumers finish a round together
};
void *customer(void *thread_work_uncasted) //consumer (reader) thread routine
{
struct thread_work_c *thread_work = (struct thread_work_c *)thread_work_uncasted;
int m = thread_work->m; //number of rows
int n = thread_work->n; //number of columns
int *L = thread_work->L;
int *PL = thread_work->PL;
int *PP = thread_work->PP;
std::string str1 = thread_work->str1;
std::string str2 = thread_work->str2;
int proccor_index_ww = thread_work->proccor_index; //worker id
int *buffer_size = thread_work->buffer_size; //number of cells to compute on the diagonal
int *index = thread_work->index; //sequence number of the diagonal
//proccor_index is advanced below, so keep the original value (this thread's id)
int proccor_index = proccor_index_ww;
int proccor_size = thread_work->proccor_size; //number of consumer threads
int *rc = thread_work->rc; //number of active readers
int *rc1 = thread_work->rc1; //counter that lets all consumers finish a round together
int cost, cow, row;
//each consumer performs exactly m+n-1 rounds (diagonals 2 .. m+n, so round
//always equals *index); counting rounds locally avoids re-reading *index
//unsynchronized while the producer updates it, a race that could otherwise
//make a consumer skip the final diagonal
for (int round = 2; round <= m + n; round++)
{
//acquire this thread's once-per-round permit
sem_wait(&pth[proccor_index]);
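//reader entry: the first reader of the round locks out the writer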
sem_wait(&mutex);
(*rc)++;
if (*rc == 1)
sem_wait(&wrt);
sem_post(&mutex);
//reader section: each consumer computes the cells of the diagonal assigned to its id, strided by the number of consumers
while (proccor_index <= *buffer_size)
{
//diagonals that still start in row 0 (*index < n + 1): the cell for row i is stored at L[i]
if (*index < n + 1)
{
cow = *index - proccor_index;
row = proccor_index;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[proccor_index] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index - 1] + cost);
}
//diagonals past the last column (*index > n + 1): the top cell is row *index - n, values stored at L[i - 1]
else if (*index > n + 1)
{
row = *index - n + proccor_index - 1;
cow = n - proccor_index + 1;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[proccor_index - 1] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index] + cost);
}
//the transition diagonal (*index == n + 1): same cell coordinates, values stored at L[i - 1]
else
{
row = *index - n + proccor_index - 1;
cow = n - proccor_index + 1;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[proccor_index - 1] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index - 1] + cost);
}
proccor_index = proccor_index + proccor_size;
}
proccor_index = proccor_index_ww;
//end of round: reader exit protocol
sem_wait(&mutex);
(*rc1)++;
if (*rc1 == proccor_size) //the last consumer of the round resets state and wakes the producer
{
(*rc) = 0;
(*rc1) = 0;
*buffer_size = 0;
sem_post(&empty);
sem_post(&wrt);
}
sem_post(&mutex);
}
return NULL;
}
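// Serial fallback: sweep the DP matrix one anti-diagonal at a time. A cell on
// diagonal `index` depends only on the previous diagonal PL (insertion/deletion)
// and the one before it, PP (match/substitution), i.e. in the upper-left region
//   L[i] = min(PL[i-1] + 1, PL[i] + 1, PP[i-1] + cost)
// (with the index shifts of the other two regions handled case by case). This
// dependence on only the two previous diagonals is exactly what makes the
// per-diagonal parallelization above possible.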
void serial_operation(int m, int n, int index, int *L, int *PL, int *PP, std::string str1, std::string str2)
{
int cow, row, cost, buffer_size;
while (index < m + n + 1)
{
if (index < n + 1)
{
if (index > m)
{
buffer_size = m;
L[0] = index;
}
else
{
buffer_size = index - 1;
L[0] = index;
L[index] = index;
}
for (int i = 1; i <= buffer_size; i++)
{
cow = index - i;
row = i;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[i] = min(PL[i - 1] + 1, PL[i] + 1, PP[i - 1] + cost);
}
}
else if (index > n + 1)
{
buffer_size = m + n + 1 - index;
for (int i = 1; i <= buffer_size; i++)
{
row = index - n + i - 1;
cow = n - i + 1;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[i - 1] = min(PL[i - 1] + 1, PL[i] + 1, PP[i] + cost);
}
}
else
{
buffer_size = m;
for (int i = 1; i <= buffer_size; i++)
{
cow = index - i;
row = i;
if (str1[cow - 1] == str2[row - 1])
cost = 0;
else
cost = 1;
L[i - 1] = min(PL[i - 1] + 1, PL[i] + 1, PP[i - 1] + cost);
}
}
for (int i = 0; i < m + 1; i++)
{
PP[i] = PL[i];
PL[i] = L[i];
}
index++;
}
}
int main()
{
std::string str1;
std::string str2;
std::getline(std::cin, str1);
std::getline(std::cin, str2);
int m, n;
n = str1.length();
m = str2.length();
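// Note (an observation about this code, not enforced anywhere): the diagonal
// bookkeeping assumes str2 is the shorter string (m <= n) and that m >= 2;
// otherwise the initialization of L[2] below writes out of bounds.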
int *L = (int *)malloc(sizeof(int) * (m + 1));
int *PL = (int *)malloc(sizeof(int) * (m + 1));
int *PP = (int *)malloc(sizeof(int) * (m + 1));
// Determine the amount of available CPUs
int cpus = get_nprocs();
// get_nprocs() may return the wrong count inside a container;
// use MAX_CPUS instead, if set.
if (getenv("MAX_CPUS"))
{
cpus = atoi(getenv("MAX_CPUS"));
}
if (cpus > m + 1)
{
cpus = m + 1;
}
// Sanity-check
//assert(cpus > 0 && cpus <= 64);
//initialize the first two diagonals and the boundary of the third
PP[0] = 0;
PL[0] = 1;
PL[1] = 1;
L[0] = 2;
L[2] = 2;
int buffer_size = 1; //interior cells to compute on the third diagonal (index = 2)
int index = 2; //the first two diagonals are already initialized
int rc = 0;
int rc1 = 0;
//serial computation when only one CPU is available
if (cpus == 1)
{
serial_operation(m, n, index, L, PL, PP, str1, str2);
}
else //parallel computation
{
/*initialize the mutual-exclusion semaphores to 1*/
sem_init(&mutex, 0, 1);
sem_init(&wrt, 0, 1);
sem_init(&empty, 0, 0); //0, so that the consumers run first
pth = (sem_t *)malloc(sizeof(sem_t) * (cpus));
for (int i = 1; i < cpus; i++)
sem_init(&pth[i], 0, 1);
struct thread_work_t str_pro; //producer argument struct
std::vector<struct thread_work_c> str_cus(cpus - 1); //consumer argument structs (std::vector instead of a non-standard VLA)
std::vector<pthread_t> thread(cpus);
str_pro.buffer_size = &buffer_size;
str_pro.index = &index;
str_pro.L = L;
str_pro.PL = PL;
str_pro.PP = PP;
str_pro.m = m;
str_pro.n = n;
str_pro.cpus = cpus;
//fill in the consumer structs and create the consumer threads
for (int i = 0; i < cpus - 1; i++)
{
str_cus[i].L = L;
str_cus[i].PL = PL;
str_cus[i].PP = PP;
str_cus[i].buffer_size = &buffer_size;
str_cus[i].index = &index;
str_cus[i].m = m;
str_cus[i].n = n;
str_cus[i].proccor_index = i + 1; //consumer ids start at 1
str_cus[i].proccor_size = cpus - 1; //number of consumer (reader) threads
str_cus[i].str1 = str1;
str_cus[i].str2 = str2;
str_cus[i].rc = &rc;
str_cus[i].rc1 = &rc1;
//create consumer thread
pthread_create(&thread[i + 1], NULL, customer, (void *)&str_cus[i]);
}
//create the producer thread
pthread_create(&thread[0], NULL, producer, (void *)&str_pro);
for (int i = 0; i < cpus; i++)
{
// wait for all threads
pthread_join(thread[i], NULL);
}
}
std::cout << L[0] << std::endl;
return 0;
}
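For reference, the program reads the two strings from standard input, one per line (with the shorter one on the second line), and prints the edit distance. Assuming the source is saved as edit_distance.cpp (the file name is only an example), it can be built and run like this:

g++ -O2 -pthread edit_distance.cpp -o edit_distance
printf 'intention\nexecution\n' | MAX_CPUS=8 ./edit_distance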
This is experimental code whose goal was a 25x speedup on a 56-core machine, but it appears to reach only about 10x. The cause most likely lies in the test data rather than in the scheme itself: the parallelism available on each anti-diagonal is bounded by the length of the shorter string, so when that string is short relative to the number of cores, the achievable speedup is capped at roughly the shorter string's length.
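To make the bound concrete: each round computes one anti-diagonal, whose interior contains at most m cells (m being the length of the shorter string), so at most m consumers can do useful work per round; main even caps the thread count at m + 1 for this reason. With m = 10, only 10 consumer threads run, and the speedup cannot exceed about 10 no matter how many of the 56 cores are available. A 25x speedup therefore requires m of at least 25, and in practice much larger, so that the per-round synchronization cost is amortized.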