C++ 1亿数据二次排序

数据文件共一亿条记录,大小超过1GB,直接双击无法打开,可以用IntelliJ IDEA或者PyCharm打开。

每条数据有两个整型数据,排序时,先按第一个字段排序,第一个字段相同时,按第二个字段排序,都采用升序。

主要思想:把第一个字段相同的数据全部放入同一个数据结构,该数据结构包括一个key值和一个容器。例如(1,2),(1,5),(1,3)这组数据,放入该数据结构后变为1,(2,5,3);对(2,5,3)进行排序,达到“局部有序”状态;另外,对key进行排序,达到“全局有序”状态。到底先做局部排序还是先做全局排序、哪种方式能提高排序速度,取决于哪种方式的并行能力更好。并行主要来源于两个地方:一是并行读文件,二是多组数据并行排序(前提条件是将所有数据分别放入多个相互独立的内存区域)。采用多线程时,制约并行能力的主要因素是“容器不是线程安全的”。优化的目标是(见下图):在程序运行期间,让CPU有尽量多的时间处于高利用率状态。例如,当开启20个线程时,对于16核心32线程的CPU而言,CPU利用率达到60%就是比较理想的状态。

程序运行时间是2.4秒左右,包括读1G文件、排序、写1G结果文件。

前半部分的并行主要是并行读文件,主要思想是先定位文件中的20个位置,然后20个线程同时读取文件内容。

后半部分的并行包括排序的并行和写文件的并行,最后把排序结果写到了20个文件中。


中间有一段时间,CPU的利用率只有3%,这段时间没有多线程,如果想进一步加快程序运行速度,要考虑提高程序运行的并行程度,但是,“容器不是线程安全的”这一点很难解决,程序应该还有提高的余地。之前无聊写过一个hash表用于单词计数,于是在这个hash表的基础上改写,实现了二次排序,程序有点冗长。

#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <execution>
#include <iostream>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
using namespace std;

// Number of slots in every hash table and the maximum number of distinct keys
// the flattened (sorted) tables can hold.
constexpr int HASHTABLESIZE = 5000;
// Number of worker threads used for reading the input and for sorting/writing.
constexpr int THREADNUM = 20;

// One entry of a key -> values table. The same type is used both as a bucket
// (with a collision chain hanging off `next`) and as an element of the
// flattened, key-ordered arrays.
//
// Every plain member has a default initializer so that an entry created with
// `new` or `new[]` starts out as a well-defined empty entry.
struct ElementOfHashTable
{
	int key = 0;                        // first field shared by all records grouped here
	int count = 0;                      // number of values collected in vec_num
	std::vector<int> vec_num;           // second field of every record whose first field is `key`
	ElementOfHashTable* next = nullptr; // next entry in this bucket's collision chain
};

// Reader thread entry point: loads the bytes [address[tid], address[tid + 1]) of fp,
// groups the parsed (key, value) pairs by key and stores the groups in
// sortedHashTable[tid]. Returns 0, or -2 when the read buffer cannot be allocated.
int FileToSortedHashTableThread(FILE* fp, int address[], ElementOfHashTable** sortedHashTable, int tid);
// Merges sortedHashTable1 and sortedHashTable2 by ascending key into sortedHashTable;
// entries that carry the same key are combined into one entry.
void MergeSortedHashTable(ElementOfHashTable sortedHashTable1[], ElementOfHashTable sortedHashTable2[], ElementOfHashTable sortedHashTable[]);
// Splits the entries of sortedHashTableTotal across THREADNUM SortThread workers,
// waits for them and prints the total number of records. Always returns 0.
int HashTableToFile(ElementOfHashTable sortedHashTableTotal[]);
// Writer thread entry point: sorts the values of the entries
// [indexForThread[tid], indexForThread[tid + 1]) and writes them to this thread's
// own output file. Returns -1 when the output file cannot be opened.
int SortThread(ElementOfHashTable sortedHashTableTotal[], int indexForThread[], int tid);
// Ordering predicate: true when s1.key is smaller than s2.key.
bool KeyCmp(const ElementOfHashTable& s1, const ElementOfHashTable& s2);

int main()
{
	clock_t t1, t2, t3, t4, t5;
	double duration;

	t1 = clock();

	ElementOfHashTable** sortedHashTable = new ElementOfHashTable * [THREADNUM];

	for (int i = 0; i < THREADNUM; i++)
	{
		sortedHashTable[i] = new ElementOfHashTable[HASHTABLESIZE];
	}

	ElementOfHashTable* sortedHashTableTotal = new ElementOfHashTable[HASHTABLESIZE];

	for (int n = 0; n < THREADNUM; n++)
	{
		for (int i = 0; i < HASHTABLESIZE; i++)
		{
			sortedHashTable[n][i].key = 0;
			sortedHashTable[n][i].count = 0;
			sortedHashTable[n][i].vec_num.clear();
			sortedHashTable[n][i].next = NULL;
		}
	}

	for (int i = 0; i < HASHTABLESIZE; i++)
	{
		sortedHashTableTotal[i].key = 0;
		sortedHashTableTotal[i].count = 0;
		sortedHashTableTotal[i].vec_num.clear();
		sortedHashTableTotal[i].next = NULL;
	}

	FILE* fp = NULL;
	fopen_s(&fp, "e:\\10000w.txt", "rb");
	if (fp == NULL)
	{
		cout << "open file error" << endl;
		return -1;
	}
		
	fseek(fp, 0, SEEK_END);
	int filesize = ftell(fp);
	cout << "filesize: " << (double)filesize / (1024.0 * 1024.0 * 1024.0) << " GB" << endl;

	int blockSize = int(filesize / THREADNUM);
	int address[THREADNUM + 1] = { 0 };
	address[THREADNUM] = filesize;

	if (THREADNUM > 1)
	{
		for (int i = 1; i < THREADNUM; i++)
		{
			int predictAddress = blockSize * i;
			fseek(fp, predictAddress, SEEK_SET);
			while (fgetc(fp) != ';')
			{
				predictAddress++;
			}
			address[i] = predictAddress;
		}
	}

	t2 = clock();
	duration = ((double)t2 - (double)t1) / CLOCKS_PER_SEC;
	cout << "ready to read " << (double)filesize / (1024.0 * 1024.0 * 1024.0) << " GB file, get " << THREADNUM << " position in file : " << duration << " seconds" << endl;
	cout << "--------------------------------------------------------" << endl;

	thread td[THREADNUM];

	for (int i = 0; i < THREADNUM; i++)
	{
		td[i] = thread(&FileToSortedHashTableThread, fp, address, sortedHashTable, i);
	}

	for (int i = 0; i < THREADNUM; i++)
	{
		td[i].join();
	}

	t3 = clock();
	duration = ((double)t3 - (double)t2) / CLOCKS_PER_SEC;
	cout << "get sorted hash table for " << THREADNUM << " threads:  " << duration << " seconds" << endl;
	cout << "--------------------------------------------------------" << endl;

	if (THREADNUM > 1)
	{
		ElementOfHashTable* tempHashTable = new ElementOfHashTable[HASHTABLESIZE];

		for (int i = 0; i < HASHTABLESIZE; i++)
		{
			tempHashTable[i].count = sortedHashTable[0][i].count;
			tempHashTable[i].vec_num.assign(sortedHashTable[0][i].vec_num.begin(), sortedHashTable[0][i].vec_num.end());  ///???
			tempHashTable[i].key = sortedHashTable[0][i].key;
			tempHashTable[i].next = sortedHashTable[0][i].next;
		}

		for (int n = 1; n < THREADNUM; n++)
		{
			MergeSortedHashTable(tempHashTable, sortedHashTable[n], sortedHashTableTotal);

			for (int i = 0; i < HASHTABLESIZE; i++)
			{
				tempHashTable[i].count = sortedHashTableTotal[i].count;
				tempHashTable[i].vec_num.assign(sortedHashTableTotal[i].vec_num.begin(), sortedHashTableTotal[i].vec_num.end());
				tempHashTable[i].key = sortedHashTableTotal[i].key;
				tempHashTable[i].next = sortedHashTableTotal[i].next;
			}
		}
	}
	else
	{
		for (int i = 0; i < HASHTABLESIZE; i++)
		{
			sortedHashTableTotal[i].count = sortedHashTable[0][i].count;
			sortedHashTableTotal[i].vec_num.assign(sortedHashTable[0][i].vec_num.begin(), sortedHashTable[0][i].vec_num.end());
			sortedHashTableTotal[i].key = sortedHashTable[0][i].key;
			sortedHashTableTotal[i].next = sortedHashTable[0][i].next;
		}
	}

	t4 = clock();
	duration = ((double)t4 - (double)t3) / CLOCKS_PER_SEC;
	cout << "merge sorted hash table of " << THREADNUM << " threads:  " << duration << " seconds" << endl;
	cout << "--------------------------------------------------------" << endl;

	HashTableToFile(sortedHashTableTotal);

	t5 = clock();
	duration = ((double)t5 - (double)t4) / CLOCKS_PER_SEC;
	cout << "write hash table to file:  " << duration << " seconds" << endl;
	cout << "--------------------------------------------------------" << endl;

	duration = ((double)t5 - (double)t1) / CLOCKS_PER_SEC;
	cout << "total time: " << duration << " seconds" << endl;
	cout << "--------------------------------------------------------" << endl;

	if (fp != NULL)
		fclose(fp);

	return 0;
}

int FileToSortedHashTableThread(FILE* fp, int address[], ElementOfHashTable** sortedHashTable, int tid)
{
	int beginAddress = address[tid];
	int endAddress = address[tid + 1];

	ElementOfHashTable* hashTable = new ElementOfHashTable[HASHTABLESIZE];
	for (int i = 0; i < HASHTABLESIZE; i++)
	{
		hashTable[i].key = 0;
		hashTable[i].count = 0;
		hashTable[i].vec_num.clear();
		hashTable[i].next = NULL;
	}

	char* s = new char[HASHTABLESIZE];
	int i, k;
	int sum, factor;
	int index;
	ElementOfHashTable* p = NULL;
	ElementOfHashTable* q = NULL;
	ElementOfHashTable* pNewNode = NULL;

	char* line = (char*)malloc(((size_t)endAddress - (size_t)beginAddress) + 1);
	if (line == NULL)
	{
		cout << "memory alloc error" << endl;
		return -2;
	}

	fseek(fp, beginAddress, SEEK_SET);
	fread_s(line, (size_t)endAddress - (size_t)beginAddress, 1, (size_t)endAddress - (size_t)beginAddress, fp);
	line[endAddress - beginAddress] = '\0';
	
	i = 0;
	
	char num[20];
	int addressInterval = endAddress - beginAddress;
	while (i < addressInterval)
	{
		int j1 = 0, j2 = 0;

		while (line[i] != '\0' && !(line[i] >= '0' && line[i] <= '9'))i++;

		while ((line[i] >= '0' && line[i] <= '9'))
		{
			s[j1++] = line[i++];
		}
		s[j1] = '\0';
		i++;

		while (line[i] >= '0' && line[i] <= '9')
		{
			num[j2++] = line[i++];
		}
		num[j2] = '\0';
		i++;

		k = 0;
		sum = 0;
		factor = 10;

		while (s[k] != '\0')  //散列函数,计算index(slot)
		{
			sum = sum * factor + (s[k] - 48);
			k++;
		}
		index = sum % (HASHTABLESIZE - 3);

		//散列冲突检查
		if (hashTable[index].vec_num.empty())
		{
			hashTable[index].key = atoi(s);
			hashTable[index].count = 1;
			hashTable[index].vec_num.push_back(atoi(num));  
		}
		else if (hashTable[index].key == atoi(s))
		{
			hashTable[index].count++;
			hashTable[index].vec_num.push_back(atoi(num));
		}
		else
		{
			//冲突,在list中查找
			p = hashTable[index].next;
			q = &hashTable[index];
			int flag = 0;

			while (p != NULL && flag == 0)
			{
				if (p->key == atoi(s))
				{
					p->count++;
					p->vec_num.push_back(atoi(num));
					flag = 1;
				}
				else
				{
					q = p;
					p = p->next;
				}
			}

			if (p == NULL || flag == 0) 	//未找到,创建node
			{
				pNewNode = new ElementOfHashTable;
				pNewNode->key = atoi(s);
				pNewNode->count = 1; // no 0
				pNewNode->vec_num.push_back(atoi(num));
				pNewNode->next = NULL;
				q->next = pNewNode;
			}
		}
	}

	int hashTableElementNum = 0;
	int j = 0;

	for (int i = 0; i < HASHTABLESIZE; i++)
	{
		if (!hashTable[i].vec_num.empty())
		{
			sortedHashTable[tid][j].key = hashTable[i].key;
			sortedHashTable[tid][j].count = hashTable[i].count;
			sortedHashTable[tid][j].vec_num.assign(hashTable[i].vec_num.begin(), hashTable[i].vec_num.end());
			j++;

			p = hashTable[i].next;
			while (p != NULL)
			{
				sortedHashTable[tid][j].key = p->key;
				sortedHashTable[tid][j].count = p->count;
				sortedHashTable[tid][j].vec_num.assign(p->vec_num.begin(), p->vec_num.end());
				j++;
				p = p->next;
			}
		}
	}

	partial_sort(&sortedHashTable[tid][0], &sortedHashTable[tid][hashTableElementNum - 1], &sortedHashTable[tid][hashTableElementNum - 1], KeyCmp);

	return 0;
}


// Strict "less than" on the key field, for sorting entries by ascending key.
bool KeyCmp(const ElementOfHashTable& s1, const ElementOfHashTable& s2)
{
	const bool firstPrecedesSecond = s2.key > s1.key;
	return firstPrecedesSecond;
}


// Merges two key-ordered tables into a third one.
//
// sortedHashTable1, sortedHashTable2  inputs of HASHTABLESIZE entries each; the
//                                     used entries come first, ordered by
//                                     ascending key, followed by unused entries
// sortedHashTable                     output of HASHTABLESIZE entries; must not
//                                     be one of the inputs
//
// An entry counts as used when it holds at least one value, so a key of 0 is
// ordinary data rather than an end marker. Entries with the same key in both
// inputs become a single output entry holding the values of input 1 followed
// by those of input 2. Output entries behind the merged range are reset so no
// stale data survives. The inputs are left unchanged.
void MergeSortedHashTable(ElementOfHashTable sortedHashTable1[], ElementOfHashTable sortedHashTable2[], ElementOfHashTable sortedHashTable[])
{
	// True while `pos` is inside the table and refers to a used entry.
	auto live = [](const ElementOfHashTable table[], int pos)
	{
		return pos < HASHTABLESIZE && !table[pos].vec_num.empty();
	};

	int i = 0, j = 0, k = 0;

	while (live(sortedHashTable1, i) || live(sortedHashTable2, j))
	{
		if (k >= HASHTABLESIZE)
		{
			// The output table is full; the remaining keys cannot be stored.
			cout << "merge error: too many distinct keys" << endl;
			break;
		}

		const bool has1 = live(sortedHashTable1, i);
		const bool has2 = live(sortedHashTable2, j);
		// Take from an input when the other one is exhausted or does not hold
		// a smaller key; on equal keys both flags are set.
		const bool take1 = has1 && (!has2 || sortedHashTable1[i].key <= sortedHashTable2[j].key);
		const bool take2 = has2 && (!has1 || sortedHashTable2[j].key <= sortedHashTable1[i].key);

		ElementOfHashTable& out = sortedHashTable[k];
		// The count is overwritten, so the values must be overwritten as well:
		// clear whatever the entry held before.
		out.count = 0;
		out.vec_num.clear();

		if (take1)
		{
			out.key = sortedHashTable1[i].key;
			out.count += sortedHashTable1[i].count;
			out.vec_num.insert(out.vec_num.end(), sortedHashTable1[i].vec_num.begin(), sortedHashTable1[i].vec_num.end());
			i++;
		}
		if (take2)
		{
			out.key = sortedHashTable2[j].key;
			out.count += sortedHashTable2[j].count;
			out.vec_num.insert(out.vec_num.end(), sortedHashTable2[j].vec_num.begin(), sortedHashTable2[j].vec_num.end());
			j++;
		}
		k++;
	}

	// Reset used entries left over from an earlier use of the output table.
	for (; k < HASHTABLESIZE && !sortedHashTable[k].vec_num.empty(); k++)
	{
		sortedHashTable[k].key = 0;
		sortedHashTable[k].count = 0;
		sortedHashTable[k].vec_num.clear();
	}
}

// Distributes the used entries of sortedHashTableTotal over THREADNUM
// SortThread workers, waits for all of them and prints the number of records.
//
// Every worker gets (used entries / THREADNUM) consecutive entries; the last
// worker additionally takes the remainder. Always returns 0.
int HashTableToFile(ElementOfHashTable sortedHashTableTotal[])
{
	// Count the used entries and add up how many records they hold.
	int usedEntries = 0;
	int recordTotal = 0;
	const ElementOfHashTable* const tableEnd = sortedHashTableTotal + HASHTABLESIZE;
	for (const ElementOfHashTable* entry = sortedHashTableTotal; entry != tableEnd; ++entry)
	{
		if (entry->vec_num.empty())
			continue;
		++usedEntries;
		recordTotal += entry->count;
	}

	// Slice boundaries: worker t handles [indexForThread[t], indexForThread[t + 1]).
	int indexForThread[THREADNUM + 1] = { 0 };
	const int share = usedEntries / THREADNUM;
	for (int t = 0; t < THREADNUM; ++t)
	{
		indexForThread[t] = t * share;
	}
	indexForThread[THREADNUM] = usedEntries;

	vector<thread> workers;
	workers.reserve(THREADNUM);
	for (int t = 0; t < THREADNUM; ++t)
	{
		workers.emplace_back(SortThread, sortedHashTableTotal, indexForThread, t);
	}
	for (thread& worker : workers)
	{
		worker.join();
	}

	cout << "数据个数: " << recordTotal << endl;

	return 0;
}

// Writer thread entry point.
//
// Handles the entries [indexForThread[tid], indexForThread[tid + 1]) of
// sortedHashTableTotal: sorts the values of every entry in ascending order and
// writes them to "e:\10000wsort\<first key>-<last key>.txt". Each entry becomes
// one line made of "<key> <value>;" items.
//
// Returns 0 on success (including an empty slice, for which no file is
// created) and -1 when the output file cannot be opened or written.
int SortThread(ElementOfHashTable sortedHashTableTotal[], int indexForThread[], int tid)
{
	const int first = indexForThread[tid];
	const int last = indexForThread[tid + 1];
	if (first >= last)
	{
		// Nothing assigned to this worker; there is no first/last key to name a file after.
		return 0;
	}

	// Large enough for the directory, two 11-character ints, '-' and ".txt".
	char fileName[64];
	snprintf(fileName, sizeof(fileName), "e:\\10000wsort\\%d-%d.txt", sortedHashTableTotal[first].key, sortedHashTableTotal[last - 1].key);

	FILE* fp = NULL;
	fopen_s(&fp, fileName, "w");
	if (fp == NULL)
	{
		cout << "open file error" << endl;
		return -1;
	}

	for (int i = first; i < last; i++)
	{
		const int key = sortedHashTableTotal[i].key;
		vector<int>& values = sortedHashTableTotal[i].vec_num;
		sort(values.begin(), values.end());
		for (const int value : values)
		{
			// Formatting straight into the stream has no fixed-size buffer to overflow.
			fprintf(fp, "%d %d;", key, value);
		}
		fputc('\n', fp);
	}

	const bool writeFailed = ferror(fp) != 0;
	const bool closeFailed = fclose(fp) != 0;
	if (writeFailed || closeFailed)
	{
		cout << "write file error" << endl;
		return -1;
	}
	return 0;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值