Parallelizing Edit Distance

The idea behind this experimental code comes from the paper "Parallel Algorithms for Approximate String Matching on the PRAM and LARPBS Models" by Zhong Cheng and Chen Guoliang (钟诚、陈国良). It parallelizes the serial code described in the previous article by combining the producer-consumer and reader-writer patterns: thread 0 runs the producer (writer) routine, and the remaining threads run the consumer (reader) routine. The producer and the consumers take turns: while the consumers run, the producer waits, and while the producer runs, the consumers wait. This cannot starve anyone, because an array of per-thread semaphores lets each consumer run exactly once per round and then block. Once every consumer has run once, the producer runs; when it finishes, it releases all consumers again, and the cycle repeats.
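The underlying recurrence is the standard edit-distance DP, D[i][j] = min(D[i-1][j] + 1, D[i][j-1] + 1, D[i-1][j-1] + cost), where cost is 0 on a character match and 1 otherwise. Every cell of one anti-diagonal depends only on the two preceding diagonals (the PL and PP arrays below), so all cells of a diagonal can be computed in parallel.

Before the full program, here is a minimal, self-contained sketch of just the turn-taking protocol, with the edit-distance work replaced by a printf. This is my illustration, not code from the paper; NUM_CONSUMERS and NUM_ROUNDS are arbitrary values:

#include <cstdio>
#include <pthread.h>
#include <semaphore.h>

#define NUM_CONSUMERS 3 // illustrative values, not from the original program
#define NUM_ROUNDS 5

static sem_t empty_sem;           // producer sleeps here until a round is fully consumed
static sem_t turn[NUM_CONSUMERS]; // one semaphore per consumer: "you may run once"
static sem_t count_mutex;         // protects `finished`
static int finished = 0;          // consumers that have finished the current round

void *producer_sketch(void *)
{
	// Round 0 is "pre-produced" by the semaphore initialization in main().
	for (int round = 1; round < NUM_ROUNDS; round++)
	{
		sem_wait(&empty_sem); // wait until every consumer finished the previous round
		printf("producer: preparing round %d\n", round);
		for (int i = 0; i < NUM_CONSUMERS; i++)
			sem_post(&turn[i]); // release every consumer exactly once
	}
	return NULL;
}

void *consumer_sketch(void *arg)
{
	int id = *(int *)arg;
	for (int round = 0; round < NUM_ROUNDS; round++)
	{
		sem_wait(&turn[id]); // blocks until the producer releases this round
		printf("consumer %d: working on round %d\n", id, round);
		sem_wait(&count_mutex);
		if (++finished == NUM_CONSUMERS) // the last consumer of the round...
		{
			finished = 0;
			sem_post(&empty_sem); // ...wakes the producer for the next round
		}
		sem_post(&count_mutex);
	}
	return NULL;
}

int main()
{
	sem_init(&empty_sem, 0, 0);   // producer must wait for the first round
	sem_init(&count_mutex, 0, 1);
	int ids[NUM_CONSUMERS];
	pthread_t threads[NUM_CONSUMERS + 1];
	for (int i = 0; i < NUM_CONSUMERS; i++)
	{
		sem_init(&turn[i], 0, 1); // each consumer may run once before the producer
		ids[i] = i;
		pthread_create(&threads[i], NULL, consumer_sketch, &ids[i]);
	}
	pthread_create(&threads[NUM_CONSUMERS], NULL, producer_sketch, NULL);
	for (int i = 0; i <= NUM_CONSUMERS; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

The real program below plays the same game, except that the producer's "prepare" step shifts the diagonal buffers and the consumers' "work" step fills in their share of the new diagonal; the mutex/rc pair additionally implements the reader-count bookkeeping of the reader-writer pattern.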

The full code is as follows:

#include <iostream>
#include <algorithm>
#include <string>
#include <sstream>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <semaphore.h>
sem_t mutex, wrt, *pth, empty;

template <typename T>
inline auto
min(const T &a, const T &b, const T &c) -> const T &
{
	return std::min(std::min(a, b), c);
}

// Producer work descriptor
struct thread_work_t
{
	int *L, *PL, *PP; // the three anti-diagonals of the DP matrix
	int *buffer_size; // number of cells to compute on the current anti-diagonal
	int *index;		  // index of the current anti-diagonal
	int m, n;		  // number of rows, number of columns
	int cpus;		  // total number of threads (CPUs)
};
// Producer routine (writer): advances to the next anti-diagonal,
// shifts the L/PL/PP buffers, and releases the consumers.
void *producer(void *thread_work_uncasted)
{
	struct thread_work_t *thread_work = (struct thread_work_t *)thread_work_uncasted;
	int m = thread_work->m; // number of rows
	int n = thread_work->n; // number of columns
	int *L = thread_work->L;
	int *PL = thread_work->PL;
	int *PP = thread_work->PP;
	int cpus = thread_work->cpus;
	int *buffer_size = thread_work->buffer_size; // cells to compute on the current diagonal
	int *index = thread_work->index;			 // diagonal index

	while ((*index) < m + n)
	{

		sem_wait(&empty); // wait until all consumers have finished the previous round
		sem_wait(&wrt);   // writer lock: no consumer may read while the producer updates shared data
		// writer section
		(*index)++;
		for (int i = 0; i < m + 1; i++)
		{
			PP[i] = PL[i];
			PL[i] = L[i];
		}
		// set up the boundary values and cell count of the new diagonal
		if (*index <= m)
		{
			*buffer_size = *index - 1;
			L[0] = *index;
			L[*index] = *index;
		}
		else if (*index <= n)
		{
			*buffer_size = m;
			L[0] = *index;
		}
		else
		{
			*buffer_size = m + n + 1 - *index;
		}
		// release every consumer for the new round
		for (int i = 1; i < cpus; i++)
			sem_post(&pth[i]);
		sem_post(&wrt);
	}
	return NULL;
}
// Consumer work descriptor
struct thread_work_c
{
	int *L, *PL, *PP;		// the three anti-diagonals
	std::string str1, str2; // the two input strings being compared
	int proccor_index;		// worker index, starting from 1; worker 0 runs the producer
	int proccor_size;		// number of consumer threads
	int *buffer_size;		// number of cells to compute on the current anti-diagonal
	int *index;				// index of the current anti-diagonal
	int m, n;				// number of rows, number of columns
	int *rc;				// reader count, initialized to 0
	int *rc1;				// makes all consumers finish each round together
};

void *customer(void *thread_work_uncasted) // consumer (reader) thread routine
{
	struct thread_work_c *thread_work = (struct thread_work_c *)thread_work_uncasted;
	int m = thread_work->m; // number of rows
	int n = thread_work->n; // number of columns
	int *L = thread_work->L;
	int *PL = thread_work->PL;
	int *PP = thread_work->PP;
	std::string str1 = thread_work->str1;
	std::string str2 = thread_work->str2;

	int proccor_index_ww = thread_work->proccor_index; // this thread's worker index
	int *buffer_size = thread_work->buffer_size;	   // cells to compute on the current diagonal
	int *index = thread_work->index;				   // diagonal index

	// the worker index is advanced inside the loop below, so keep the
	// original value (this thread's own index) to restore each round
	int proccor_index = proccor_index_ww;

	int proccor_size = thread_work->proccor_size; // number of consumer threads
	int *rc = thread_work->rc;					  // reader count: consumers currently holding the read lock
	int *rc1 = thread_work->rc1;				  // consumers that have finished the current round
	int cost, cow, row;
	while (*index < m + n)
	{

		// acquire this thread's turn for the current round
		sem_wait(&pth[proccor_index]);
		sem_wait(&mutex);
		(*rc)++;
		if (*rc == 1)
			sem_wait(&wrt);
		sem_post(&mutex);

		// each consumer computes the cells whose position on the diagonal
		// matches its worker index, striding by the number of consumers
		while (proccor_index <= *buffer_size)
		{
			if (*index < n + 1)
			{
				cow = *index - proccor_index;
				row = proccor_index;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[proccor_index] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index - 1] + cost);
			}
			else if (*index > n + 1)
			{
				row = *index - n + proccor_index - 1;
				cow = n - proccor_index + 1;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[proccor_index - 1] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index] + cost);
			}
			else
			{
				row = *index - n + proccor_index - 1;
				cow = n - proccor_index + 1;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[proccor_index - 1] = min(PL[proccor_index - 1] + 1, PL[proccor_index] + 1, PP[proccor_index - 1] + cost);
			}
			proccor_index = proccor_index + proccor_size;
		}
		proccor_index = proccor_index_ww; // restore this thread's index for the next round
		// round bookkeeping: the last consumer to finish resets the counters and wakes the producer
		sem_wait(&mutex);
		(*rc1)++;
		if (*rc1 == proccor_size)
		{
			(*rc) = 0;
			(*rc1) = 0;
			*buffer_size = 0;
			sem_post(&empty);
			sem_post(&wrt);
		}
		sem_post(&mutex);
	}
	return NULL;
}
// Serial fallback: computes all remaining anti-diagonals on a single thread.
void serial_operation(int m, int n, int index, int *L, int *PL, int *PP, std::string str1, std::string str2)
{
	int cow, row, cost, buffer_size;
	while (index < m + n + 1)
	{
		if (index < n + 1)
		{
			if (index > m)
			{
				buffer_size = m;
				L[0] = index;
			}
			else
			{
				buffer_size = index - 1;
				L[0] = index;
				L[index] = index;
			}

			for (int i = 1; i <= buffer_size; i++)
			{
				cow = index - i;
				row = i;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[i] = min(PL[i - 1] + 1, PL[i] + 1, PP[i - 1] + cost);
			}
		}
		else if (index > n + 1)
		{
			buffer_size = m + n + 1 - index;
			for (int i = 1; i <= buffer_size; i++)
			{
				row = index - n + i - 1;
				cow = n - i + 1;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[i - 1] = min(PL[i - 1] + 1, PL[i] + 1, PP[i] + cost);
			}
		}
		else
		{
			buffer_size = m;
			for (int i = 1; i <= buffer_size; i++)
			{
				cow = index - i;
				row = i;
				if (str1[cow - 1] == str2[row - 1])
					cost = 0;
				else
					cost = 1;
				L[i - 1] = min(PL[i - 1] + 1, PL[i] + 1, PP[i - 1] + cost);
			}
		}

		for (int i = 0; i < m + 1; i++)
		{
			PP[i] = PL[i];
			PL[i] = L[i];
		}
		index++;
	}
}
int main()
{

	std::string str1;
	std::string str2;
	std::getline(std::cin, str1);
	std::getline(std::cin, str2);
	int m, n;
	n = str1.length();
	m = str2.length();
	
	int *L = (int *)malloc(sizeof(int) * (m + 1));
	int *PL = (int *)malloc(sizeof(int) * (m + 1));
	int *PP = (int *)malloc(sizeof(int) * (m + 1));
	// Determine the number of available CPUs
	int cpus = get_nprocs();
	// get_nprocs() may return the wrong count inside a container.
	// Use the MAX_CPUS environment variable instead, if set.
	if (getenv("MAX_CPUS"))
	{
		cpus = atoi(getenv("MAX_CPUS"));
	}

	if (cpus > m + 1)
	{
		cpus = m + 1;
	}

	// Sanity-check
	//assert(cpus > 0 && cpus <= 64);
	// initialize the first two anti-diagonals
	PP[0] = 0;
	PL[0] = 1;
	PL[1] = 1;
	L[0] = 2;
	L[2] = 2;
	int buffer_size = 1; // cells to compute on the third diagonal (index = 2)
	int index = 2;		 // the first two diagonals are already initialized
	int rc = 0;
	int rc1 = 0;
	// serial computation
	if (cpus == 1)
	{
		serial_operation(m, n, index, L, PL, PP, str1, str2);
	}
	else // parallel computation
	{
		
		/* initialize the mutual-exclusion semaphores to 1 */
		sem_init(&mutex, 0, 1);
		sem_init(&wrt, 0, 1);
		sem_init(&empty, 0, 0); // 0, so that the consumers run first

		pth = (sem_t *)malloc(sizeof(sem_t) * (cpus));
		for (int i = 1; i < cpus; i++)
			sem_init(&pth[i], 0, 1);

		struct thread_work_t str_pro;			// producer descriptor
		struct thread_work_c str_cus[cpus - 1]; // consumer descriptors

		pthread_t thread[cpus];

		str_pro.buffer_size = &buffer_size;
		str_pro.index = &index;
		str_pro.L = L;
		str_pro.PL = PL;
		str_pro.PP = PP;
		str_pro.m = m;
		str_pro.n = n;
		str_pro.cpus = cpus;

		// fill in the consumer descriptors and create the consumer threads
		for (int i = 0; i < cpus - 1; i++)
		{
			str_cus[i].L = L;
			str_cus[i].PL = PL;
			str_cus[i].PP = PP;

			str_cus[i].buffer_size = &buffer_size;
			str_cus[i].index = &index;
			str_cus[i].m = m;
			str_cus[i].n = n;
			str_cus[i].proccor_index = i + 1;   // worker index, starting from 1
			str_cus[i].proccor_size = cpus - 1; // number of consumer threads
			str_cus[i].str1 = str1;
			str_cus[i].str2 = str2;
			str_cus[i].rc = &rc;
			str_cus[i].rc1 = &rc1;
			// create a consumer thread
			pthread_create(&thread[i + 1], NULL, customer, (void *)&str_cus[i]);
		}

		// create the producer thread
		pthread_create(&thread[0], NULL, producer, (void *)&str_pro);

		for (int i = 0; i < cpus; i++)
		{
			// wait for all threads
			pthread_join(thread[i], NULL);
		}
	}
	std::cout << L[0] << std::endl;
	return 0;
}
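A quick usage note (the file name edit_distance_parallel.cpp and the test inputs are my own, not from the original post): build with the pthread flag and feed the two strings on standard input; the MAX_CPUS environment variable read by the program caps the thread count.

g++ -O2 -pthread edit_distance_parallel.cpp -o edit_distance
printf 'kitten\nsitting\n' | MAX_CPUS=4 ./edit_distance

The edit distance between "kitten" and "sitting" is 3, so the program should print 3.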

This is experimental code: the goal was a 25x speedup on a 56-core machine, but it only reaches roughly 10x. The likely cause lies in the test data rather than the synchronization scheme: an anti-diagonal holds at most min(m, n) + 1 cells, so when the shorter string is short relative to the core count, the speedup is bounded by the shorter string's length. For example, with a 10-character shorter string, at most about 10 cells per diagonal can be computed in parallel, no matter how many cores are available.
