Lock-free 多核数据结构设计

最新推荐文章于 2022-07-27 22:47:20 发布

joeywen

最新推荐文章于 2022-07-27 22:47:20 发布

阅读量1.2k

点赞数

分类专栏：程序算法分布式计算文章标签：数据结构多核 hashtable lock-free

程序算法同时被 2 个专栏收录

43 篇文章 0 订阅

订阅专栏

分布式计算

25 篇文章 0 订阅

订阅专栏

lock-free思想背景

基本的多核数据结构设计是非常简单的：只需要在并发处理同一数据结构时，加上locks就可以了。这种思想在并发数不是很多的情况下工作的很好。因为这时的资源争用开销并不是很大。

随着多核机器的规模越来越大（例如成百上千个核），这种加lock的机制就不太适用了：如果只有少数几个锁，锁的争用会变得非常激烈；如果使用大量细粒度的锁，锁本身的开销又会随之增加。

所以对于大型多核机器来说，“Lock-free”的设计非常常见。以下这些策略基本覆盖了这种设计思想：

1、我们不是每次都使用lock，而是只在真正需要的时候才使用（例如只对写操作加锁，读操作不加锁）；

2、我们不使用现成的lock，而是自己构建功能相同的机制（loop until free，即循环等待直到资源空闲，也就是“自旋等待”）；

3、我们不使用lock，相反我们采用原子指令操作;

4、我们不使用lock，因为我们假设并发线程不同时访问同一数据；

5、我们不使用lock，因为我们可以证明多线程从来不在同一时间访问同一数据

一个非常严重而且非常复杂的因素是机器的“内存一致性模型”：一个线程写入的数据，何时、以何种顺序对另一个线程可见。假设你先后向内存中的两个位置写入数据，先写A，再写B。如果其他线程已经从B处读到了你的新数据，你可能会期望它们在A处也能读到新数据。不幸的是，你写入A的数据很可能还停留在存储缓冲区（store buffer）中，导致B处的写入比A处的写入更早对其他线程可见。因此，所有的机器都提供“内存栅栏”（memory fence）操作，用来确保此前的所有写操作都已对外可见。

Example：Lock-Free Hashtable 设计

这里有一个很好的例子：Dr. Cliff Click的lock-free hashtable（用Java实现，面向“Azul”原生Java 54核/芯片的大规模多核（massive multicore）硬件）。Dr. Cliff Click的思想简单直接：他把内存一致性模型暴露给了用户。lock-free的写操作是很复杂的，特别是在重新调整hashtable的大小时。

以下是Dr. Lawlor采用Cliff的思想编写的代码，但去掉了最棘手的调整大小（resize）操作。这些代码只适合于小规模测试。

#include <omp.h>

#include <climits>

/*
For performance debugging: counts hashtable collisions, i.e. the number of
extra probe steps taken by get() and put(). It is incremented under
"#pragma omp atomic" and reset by the benchmark functions.
*/

int collisions = 0;

/*
A hashtable whose reads and writes take no locks. Slots live in one flat
array; a key starts at slot (key & (size - 1)) and walks forward, wrapping
around, until it finds its own key or an empty slot.

Does not yet implement hashtable resize: the caller must size the table
(constructor or reinit) so that it never fills up, because put() keeps
probing until it finds a usable slot.

KEY must be convertible to an integer (it is used directly as the hash) and
both KEY and VALUE must be readable/writable through volatile lvalues, which
in practice means built-in arithmetic types.

NOTE(review): put() claims an empty slot with a plain read followed by a plain
write, not an atomic compare-and-swap, so two threads that observe the same
empty slot can both write their key and one of them can lose its slot after
it has already stored its value. Confirm whether that is acceptable for the
intended use before relying on this under heavy write contention.
*/
template <class KEY, class VALUE>
class LockFreeHashtable {
	KEY missingk; // sentinel key that marks an empty slot
	VALUE missingv; // value reported by get() for keys that are not present

	long size; // number of slots; always a power of two, at least 1
	struct KEY_VALUE{
		KEY k;
		VALUE v;
	};

	volatile KEY_VALUE *data;

	// Copying is disabled (declared private, never defined): a copy would
	// share `data` with the original and both destructors would delete[] it.
	LockFreeHashtable(const LockFreeHashtable &);
	void operator=(const LockFreeHashtable &);
public:
	/*
	Builds a table with room for at least size_ entries.
	  size_     requested slot count (rounded up, see reinit)
	  missingk_ key value reserved to mean "empty slot"; never put() it
	  missingv_ value returned by get() when the key is absent
	*/
	LockFreeHashtable(long size_, KEY missingk_=KEY(), VALUE missingv_=VALUE())
		:missingk(missingk_), missingv(missingv_), size(0), data(0) {
			reinit(size_);
	}

	~LockFreeHashtable() {delete[] data;}

	/*
	Discards all entries and reinitializes the table to a new size.
	The slot count is rounded up to the next power of two (minimum 1) because
	get() and put() reduce indices with "idx & (size - 1)", which only covers
	every slot when size is a power of two.
	Must not run while other threads are inside get() or put(): it frees the
	array they are indexing.
	*/
	void reinit(long size_) {
		long capacity = 1;
		while (capacity < size_) {
			capacity *= 2;
		}

		/* Build the replacement first so that a failed allocation leaves the
		current table intact instead of a dangling pointer. */
		KEY_VALUE *fresh = new KEY_VALUE[capacity];
		for (long i = 0; i < capacity; ++i) {
			fresh[i].k = missingk;
			fresh[i].v = missingv;
		}

		delete[] data;
		data = fresh;
		size = capacity;
	}

	/*
	Reads the VALUE stored for the KEY.
	Returns a reference to the stored value, or to the missing-value sentinel
	when the key is not in the table. Every probe step past the home slot
	bumps the global collisions counter.
	*/
	volatile const VALUE &get(const KEY &k) {
		long idx = k;
		/* At most `size` probes: once every slot has been inspected the key
		cannot be present, even if the table has no empty slot left. */
		for (long probes = 0; probes < size; ++probes) {
			idx &= size - 1; /*===     idx %= size; */
			if (data[idx].k == k) { /*old key*/
				return data[idx].v;
			}

			if (data[idx].k == missingk) { /*missing key*/
				return missingv;
			}

			idx ++; /*move down: keep looking !*/
#pragma omp atomic
			collisions ++;
		}

		return missingv;
	}

	/*
	Writes a copy of the VALUE for this KEY, overwriting any previous value.
	Probes forward from the key's home slot until it finds the key or an empty
	slot; it does not return until one is found, so the table must not be full.
	Every probe step past the home slot bumps the global collisions counter.
	*/
	void put(const KEY &k, const VALUE &v) {
		long idx = k;
		while(true) {
			idx &= size -1; /*===   idx %= size;  */
		check_key:
			if (data[idx].k == k) {/*fast path: reuse old key*/
				data[idx].v = v;
				return;
			}

			if (data[idx].k == missingk) { /*try to claim new key*/
				data[idx].k = k; /*provisional ownership*/
				/* Re-read the slot: if another thread overwrote the key in the
				meantime the check above fails and we keep probing. */
				goto check_key;
			}
			idx ++; // move down: keep looking
#pragma omp atomic
			collisions ++;
		}
	}

	/*Counts the slots that currently hold a key (i.e. are not empty).*/
	int count (void) {
		int sum = 0;
		for (long i = 0; i < size; ++i) {
			if (data[i].k != missingk) sum ++;
		}

		return sum;
	}
};

/* Shared table used by the benchmark functions in this file. It is constructed
with a requested size of 0, -1 as the missing-key sentinel and -99.0 as the
missing-value sentinel; foo() calls reinit() on it before each timing run. */
LockFreeHashtable<int, float> h(0, -1, -99.0);

/* Total number of hashtable operations per benchmark pass: the loop trip count
of do_hashtable_writes() and do_hashtable_reads(). */
enum {n = 100000}; /*total number of hashtable operations*/

/*
Scrambles an operation index into a hashtable key.
Multiplies by 8193 and then folds the upper 16 bits into the lower ones, so
consecutive indices do not map to consecutive keys.
*/
inline int make_key(int i) {
	/* randomizing function; the multiply is done in unsigned arithmetic so
	that large inputs wrap around instead of overflowing a signed int, which
	would be undefined behavior */
	i = static_cast<int>(static_cast<unsigned int>(i) * 8193u);
	return i ^ (i >> 16);
}

/*
Benchmark body for writes: resets the collision counter, then stores n
key/value pairs into the shared table h from an OpenMP parallel loop.
Always returns 0.
*/
int do_hashtable_writes(void) {
	collisions = 0;

#pragma omp parallel for
	for (int op = 0; op < n; ++op) {
		const int key = make_key(op);
		const float value = op * 1.23456;
		h.put(key, value);
	}

	return 0;
}

/*
Benchmark body for reads: resets the collision counter, then looks up the
same n keys that do_hashtable_writes() stores and adds the values together
with an OpenMP sum reduction.
Returns the sum truncated to int and clamped to [INT_MIN, INT_MAX].
*/
int do_hashtable_reads(void) {
	collisions = 0;
	double sum = 0.0;

#pragma omp parallel for reduction(+:sum)
	for (int i = 0; i < n; ++i) {
		sum += h.get(make_key(i));
	}

	/* With n = 100000 stored values of i * 1.23456 the total is about 6.2e9,
	which does not fit in an int; converting an out-of-range double to int is
	undefined behavior, so clamp before converting. */
	if (sum >= static_cast<double>(INT_MAX)) {
		return INT_MAX;
	}
	if (sum <= static_cast<double>(INT_MIN)) {
		return INT_MIN;
	}

	return static_cast<int>(sum);
}

/*
Benchmark driver: for 1, 2 and 4 OpenMP threads, reinitializes the shared
table to 1024 * 512 slots, times the write pass and the read pass with
time_function(), and prints the per-operation time, the total time, the
collision count and (after the writes) the number of stored values.
Always returns 0.
*/
int foo(void) {
	int nthread = 1;
	while (nthread <= 4) {
		h.reinit(1024 * 512);
		printf("%d threads : \n", nthread);

		omp_set_num_threads(nthread);

		/* Write pass. */
		const double write_secs = time_function(do_hashtable_writes);
		const double write_ns_each = write_secs * 1.0e9 / n;
		const double write_ms = write_secs * 1.0e3;
		printf("	writes: %.3f ns per (%.3f ms total)\n", write_ns_each, write_ms);
		std::cout << "	collisions: " << collisions << "\n";
		std::cout << "	total values: " << h.count() << "\n";

		/* Read pass. */
		const double read_secs = time_function(do_hashtable_reads);
		const double read_ns_each = read_secs * 1.0e9 / n;
		const double read_ms = read_secs * 1.0e3;
		printf("	reads: %.3f ns per (%.3f ms total)\n", read_ns_each, read_ms);
		std::cout << "	collisions: " << collisions << "\n";

		nthread *= 2;
	}

	return 0;
}

在collisions很少的情况下，可伸缩性还是很好的