2020中兴捧月算法大赛埃德加考特派区域优胜奖源代码

最新推荐文章于 2022-05-02 00:18:44 发布

巴塞罗那的风

最新推荐文章于 2022-05-02 00:18:44 发布

阅读量775

点赞数

分类专栏： 2020中兴捧月大赛总结及代码文章标签：算法 c++ 数据库中兴捧月算法大赛

本文链接：https://blog.csdn.net/qq_36437446/article/details/109031901

版权

2020中兴捧月大赛总结及代码专栏收录该内容

2 篇文章 10 订阅

订阅专栏

说明：

该版本最快时长为1100ms左右，中间有一些我当时分析各个操作时间占比的代码并没有注释掉

说明：
并非最终获奖版本，该版本最快时长为1100ms左右

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include<algorithm>
#include<functional>
#include<ctime>
#include<unordered_map>

using namespace std;
typedef int dataType;















//save csv struct
typedef vector<vector<dataType>> inputStruct;

void showInputStruct(const inputStruct& input);
// read csv file
inputStruct readCsv(const char*);
//sort-Merge
inputStruct joinCsv_sortMerge(inputStruct&, inputStruct&, int, int);
//gropyBy with sort
void gropyBy_sort2(inputStruct&, int, int, vector<dataType>&);
//计算每个group的max(t1.id1)
int findMax2(int, int);
//计算每个group的min(t2.id1)
int findMin2(int, int);

inputStruct res;

int main()
{
	iostream::sync_with_stdio(false);
	clock_t stread = clock();
	//读取两个数据文件
	inputStruct input1 = readCsv("/home/web/ztedatabase/myinput1.csv");
	inputStruct input2 = readCsv("/home/web/ztedatabase/myinput2.csv");
	clock_t endread = clock();
	cout << "read time:" << (endread - stread) / CLOCKS_PER_SEC << endl;
	clock_t stjoin = clock();
	//sort-merge排序
	res = joinCsv_sortMerge(input1, input2, 3, 3);
	clock_t endjoin = clock();
	cout << "join file time:" << (endjoin - stjoin) / CLOCKS_PER_SEC << endl;
	input1.clear();
	input2.clear();
	vector<dataType> index(0);
	clock_t stsort2 = clock();
	//分组，index用来标记不同group的起始位置
	gropyBy_sort2(res, 1, 4, index);
	index.push_back(res.size());
	clock_t endsort2 = clock();
	cout << "group file time:" << (endsort2 - stsort2) / CLOCKS_PER_SEC << endl;
	int size_index = index.size();
	clock_t stzhuan = clock();
	//复用原有空间
	for (int i = 0;i < size_index - 1;i++)
	{
		int max = findMax2(index[i], index[i + 1]);
		int min = findMin2(index[i], index[i + 1]);
		res[i][0] = max;
		res[i][1] = min;
		res[i][2] = res[index[i]][4];
		res[i][3] = res[index[i]][2];
	}
	clock_t endzhuan = clock();
	cout << "zhuan time:" << (endzhuan - stzhuan) / CLOCKS_PER_SEC << endl;
	clock_t stsort = clock();
	//最后的sort
	sort(res.begin(), res.begin() + size_index - 1, [](vector<dataType>& input1, vector<dataType>& input2) {
		if (input1[0] < input2[0])
		{
			return true;
		}
		else if (input1[0] > input2[0])
		{
			return false;
		}
		else if (input1[0] == input2[0])
		{
			if (input1[2] < input2[2])
			{
				return true;
			}
			else if (input1[2] > input2[2])
			{
				return false;
			}
			else if (input1[2] == input2[2])
			{
				if (input1[3] < input2[3])
				{
					return true;
				}
				else if (input1[3] > input2[3])
				{
					return false;
				}
				else if (input1[3] == input2[3])
				{
					return false;
				}
			}
		}
		});
	clock_t endsort = clock();
	cout << "sort time:" << (endsort - stsort) / CLOCKS_PER_SEC << endl;
	//输出结果
	for (int i = 0;i < size_index - 1;i++)
	{
		cout<<res[i][0]<<","<<res[i][1]<<endl;
	}
	return 0;
}

inputStruct readCsv(const char* filename)
{
	//读取每一行，根据逗号进行切分
	inputStruct strArray;
	FILE* file;
	fopen_s(&file,filename, "r");
	if (file)
	{
		char* buf;    
		int len;     

		fseek(file, 0, SEEK_END);    
		len = ftell(file);         
		rewind(file);               
		buf = (char*)malloc(sizeof(char) * len + 1);
		if (!buf)
		{
			cout << "error in  malloc.\n";
		}
		int rLen = fread(buf, sizeof(char), len, file);
		buf[rLen] = '\0';
		string strbuf = buf;
		string line;
		stringstream ss(strbuf);
		stringstream ss2;
		string temp;
		vector<dataType> lineArray;
		while (getline(ss, line, '\n'))
		{
			stringstream ss2(line);
			while (getline(ss2, temp, ','))
			{
				lineArray.push_back(atoi(temp.c_str()));
			}
			strArray.push_back(lineArray);
			lineArray.clear();
		}

	}
	
	return strArray;
}

void showInputStruct(const inputStruct& input)
{
	for (int i = 0;i < input.size();i++)
	{
		for (int j = 0;j < input[i].size();j++)
		{
			cout << input[i][j] << " ";
		}
		cout << endl;
	}
}

inputStruct joinCsv_sortMerge(inputStruct& input1, inputStruct& input2, int t1, int t2)
{
	//代码思路见本专栏另一篇博客
	int size1 = input1.size();
	int size2 = input2.size();
	inputStruct joinRes(0);
	sort(input1.begin(), input1.end(), [](vector<dataType>& input1, vector<dataType>& input2) {
		return input1[2] < input2[2];
		});
	sort(input2.begin(), input2.end(), [](vector<dataType>& input1, vector<dataType>& input2) {
		return input1[2] < input2[2];
		});
	int i = 0, j = 0;
	vector<dataType> temp(0);
	while (i < size1 && j < size2)
	{
		if (input1[i][t1 - 1] > input2[j][t2 - 1])
		{
			j++;
		}
		else if (input1[i][t1 - 1] < input2[j][t2 - 1])
		{
			i++;
		}
		else if (input1[i][t1 - 1] == input2[j][t2 - 1])
		{
			temp.clear();
			temp.push_back(input1[i][0]);
			temp.push_back(input1[i][1]);
			temp.push_back(input1[i][2]);
			temp.push_back(input2[j][0]);
			temp.push_back(input2[j][1]);
			temp.push_back(input2[j][2]);
			joinRes.push_back(temp);
			int j1 = j + 1;
			while (j1 < size2 && input1[i][t1 - 1] == input2[j1][t2 - 1])
			{
				temp.clear();
				temp.push_back(input1[i][0]);
				temp.push_back(input1[i][1]);
				temp.push_back(input1[i][2]);
				temp.push_back(input2[j1][0]);
				temp.push_back(input2[j1][1]);
				temp.push_back(input2[j1][2]);
				joinRes.push_back(temp);
				j1++;
			}
			int i1 = i + 1;
			while (i1 < size1 && input1[i1][t1 - 1] == input2[j][t2 - 1])
			{
				temp.clear();
				temp.push_back(input1[i1][0]);
				temp.push_back(input1[i1][1]);
				temp.push_back(input1[i1][2]);
				temp.push_back(input2[j][0]);
				temp.push_back(input2[j][1]);
				temp.push_back(input2[j][2]);
				joinRes.push_back(temp);
				i1++;
			}
			i++;
			j++;
		}
	}
	return joinRes;
}

void gropyBy_sort2(inputStruct& input, int t1, int t2, vector<dataType>& index)
{
	int size = input.size();
	sort(input.begin(), input.end(), [](vector<dataType>& fir, vector<dataType>& sec)
		{
			if (fir[1] < sec[1])
			{
				return true;
			}
			else if (fir[1] > sec[1])
			{
				return false;
			}
			else if (fir[1] == sec[1])
			{
				if (fir[4] == sec[4])
				{
					return false;
				}
				else if (fir[4] < sec[4])
				{
					return true;
				}
				else if (fir[4] > sec[4])
				{
					return false;
				}
			}
		});
	index.push_back(0);
	for (int i = 1;i < size;i++)
	{
		if ((input[i - 1][t1] != input[i][t1])
			|| input[i - 1][t2] != input[i][t2])
		{
			index.push_back(i);
		}
	}
}

int findMax2(int index1, int index2)
{
	int flag = res[index1][0];
	for (;index1 < index2;index1++)
	{
		if (flag < res[index1][0])
		{
			flag = res[index1][0];
		}
	}
	return flag;
}

int findMin2(int index1, int index2)
{
	int flag = res[index1][3];
	for (;index1 < index2;index1++)
	{
		if (res[index1][3] < flag)
		{
			flag = res[index1][3];
		}
	}
	return flag;
}

代码思路：

首先进行文件读取

然后是连接算法，这里采用了sort merge，并以小表为基准创建hash结构

Group操作:

这里并没有使用新的空间来存储group的结果，而是根据group字段进行排序之后，用另外的结构index来记录t1.id2,t2.id2都相等的一个范围，比如说排序之后，第一行到第三行的两个字段值都相等，那么就记录一个0,3表示这一组开始于0，结束于3

因为select和后面order by需要使用原始数据没有的字段，所以就检测出每一个组max(t1.id1)和min(t2.id1)，然后和t2.id2，t1.id2一起组合成新的数据，但都是使用之前的空间进行，不另外进行存储，把组合成的新数据挪到A的前面

这时已经得到了要输出的结果集，结果集存储在A中，范围是index.size()，然后对这部分数据按要求进行一次排序，之后输出即可得出最后结果。

亮点可能在于，没有开辟新的空间存储每一部分的结果集，而是使用了index进行范围的标注

巴塞罗那的风

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
4
评论
2020中兴捧月算法大赛埃德加考特派区域优胜奖源代码

说明：并非最终获奖版本，该版本最快时长为1100ms左右，中间有一些我当时分析各个操作时间占比的代码并没有注释掉代码思路：首先进行文件读取然后是连接算法，这里采用了sort merge
复制链接

扫一扫