说明:
该版本最快时长为1100ms左右,中间有一些我当时分析各个操作时间占比的代码并没有注释掉
说明:
并非最终获奖版本,该版本最快时长为1100ms左右
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include<algorithm>
#include<functional>
#include<ctime>
#include<unordered_map>
using namespace std;
typedef int dataType;
//save csv struct
typedef vector<vector<dataType>> inputStruct;
void showInputStruct(const inputStruct& input);
// read csv file
inputStruct readCsv(const char*);
//sort-Merge
inputStruct joinCsv_sortMerge(inputStruct&, inputStruct&, int, int);
//gropyBy with sort
void gropyBy_sort2(inputStruct&, int, int, vector<dataType>&);
//计算每个group的max(t1.id1)
int findMax2(int, int);
//计算每个group的min(t2.id1)
int findMin2(int, int);
inputStruct res;
int main()
{
iostream::sync_with_stdio(false);
clock_t stread = clock();
//读取两个数据文件
inputStruct input1 = readCsv("/home/web/ztedatabase/myinput1.csv");
inputStruct input2 = readCsv("/home/web/ztedatabase/myinput2.csv");
clock_t endread = clock();
cout << "read time:" << (endread - stread) / CLOCKS_PER_SEC << endl;
clock_t stjoin = clock();
//sort-merge排序
res = joinCsv_sortMerge(input1, input2, 3, 3);
clock_t endjoin = clock();
cout << "join file time:" << (endjoin - stjoin) / CLOCKS_PER_SEC << endl;
input1.clear();
input2.clear();
vector<dataType> index(0);
clock_t stsort2 = clock();
//分组,index用来标记不同group的起始位置
gropyBy_sort2(res, 1, 4, index);
index.push_back(res.size());
clock_t endsort2 = clock();
cout << "group file time:" << (endsort2 - stsort2) / CLOCKS_PER_SEC << endl;
int size_index = index.size();
clock_t stzhuan = clock();
//复用原有空间
for (int i = 0;i < size_index - 1;i++)
{
int max = findMax2(index[i], index[i + 1]);
int min = findMin2(index[i], index[i + 1]);
res[i][0] = max;
res[i][1] = min;
res[i][2] = res[index[i]][4];
res[i][3] = res[index[i]][2];
}
clock_t endzhuan = clock();
cout << "zhuan time:" << (endzhuan - stzhuan) / CLOCKS_PER_SEC << endl;
clock_t stsort = clock();
//最后的sort
sort(res.begin(), res.begin() + size_index - 1, [](vector<dataType>& input1, vector<dataType>& input2) {
if (input1[0] < input2[0])
{
return true;
}
else if (input1[0] > input2[0])
{
return false;
}
else if (input1[0] == input2[0])
{
if (input1[2] < input2[2])
{
return true;
}
else if (input1[2] > input2[2])
{
return false;
}
else if (input1[2] == input2[2])
{
if (input1[3] < input2[3])
{
return true;
}
else if (input1[3] > input2[3])
{
return false;
}
else if (input1[3] == input2[3])
{
return false;
}
}
}
});
clock_t endsort = clock();
cout << "sort time:" << (endsort - stsort) / CLOCKS_PER_SEC << endl;
//输出结果
for (int i = 0;i < size_index - 1;i++)
{
cout<<res[i][0]<<","<<res[i][1]<<endl;
}
return 0;
}
inputStruct readCsv(const char* filename)
{
//读取每一行,根据逗号进行切分
inputStruct strArray;
FILE* file;
fopen_s(&file,filename, "r");
if (file)
{
char* buf;
int len;
fseek(file, 0, SEEK_END);
len = ftell(file);
rewind(file);
buf = (char*)malloc(sizeof(char) * len + 1);
if (!buf)
{
cout << "error in malloc.\n";
}
int rLen = fread(buf, sizeof(char), len, file);
buf[rLen] = '\0';
string strbuf = buf;
string line;
stringstream ss(strbuf);
stringstream ss2;
string temp;
vector<dataType> lineArray;
while (getline(ss, line, '\n'))
{
stringstream ss2(line);
while (getline(ss2, temp, ','))
{
lineArray.push_back(atoi(temp.c_str()));
}
strArray.push_back(lineArray);
lineArray.clear();
}
}
return strArray;
}
void showInputStruct(const inputStruct& input)
{
for (int i = 0;i < input.size();i++)
{
for (int j = 0;j < input[i].size();j++)
{
cout << input[i][j] << " ";
}
cout << endl;
}
}
inputStruct joinCsv_sortMerge(inputStruct& input1, inputStruct& input2, int t1, int t2)
{
//代码思路见本专栏另一篇博客
int size1 = input1.size();
int size2 = input2.size();
inputStruct joinRes(0);
sort(input1.begin(), input1.end(), [](vector<dataType>& input1, vector<dataType>& input2) {
return input1[2] < input2[2];
});
sort(input2.begin(), input2.end(), [](vector<dataType>& input1, vector<dataType>& input2) {
return input1[2] < input2[2];
});
int i = 0, j = 0;
vector<dataType> temp(0);
while (i < size1 && j < size2)
{
if (input1[i][t1 - 1] > input2[j][t2 - 1])
{
j++;
}
else if (input1[i][t1 - 1] < input2[j][t2 - 1])
{
i++;
}
else if (input1[i][t1 - 1] == input2[j][t2 - 1])
{
temp.clear();
temp.push_back(input1[i][0]);
temp.push_back(input1[i][1]);
temp.push_back(input1[i][2]);
temp.push_back(input2[j][0]);
temp.push_back(input2[j][1]);
temp.push_back(input2[j][2]);
joinRes.push_back(temp);
int j1 = j + 1;
while (j1 < size2 && input1[i][t1 - 1] == input2[j1][t2 - 1])
{
temp.clear();
temp.push_back(input1[i][0]);
temp.push_back(input1[i][1]);
temp.push_back(input1[i][2]);
temp.push_back(input2[j1][0]);
temp.push_back(input2[j1][1]);
temp.push_back(input2[j1][2]);
joinRes.push_back(temp);
j1++;
}
int i1 = i + 1;
while (i1 < size1 && input1[i1][t1 - 1] == input2[j][t2 - 1])
{
temp.clear();
temp.push_back(input1[i1][0]);
temp.push_back(input1[i1][1]);
temp.push_back(input1[i1][2]);
temp.push_back(input2[j][0]);
temp.push_back(input2[j][1]);
temp.push_back(input2[j][2]);
joinRes.push_back(temp);
i1++;
}
i++;
j++;
}
}
return joinRes;
}
void gropyBy_sort2(inputStruct& input, int t1, int t2, vector<dataType>& index)
{
int size = input.size();
sort(input.begin(), input.end(), [](vector<dataType>& fir, vector<dataType>& sec)
{
if (fir[1] < sec[1])
{
return true;
}
else if (fir[1] > sec[1])
{
return false;
}
else if (fir[1] == sec[1])
{
if (fir[4] == sec[4])
{
return false;
}
else if (fir[4] < sec[4])
{
return true;
}
else if (fir[4] > sec[4])
{
return false;
}
}
});
index.push_back(0);
for (int i = 1;i < size;i++)
{
if ((input[i - 1][t1] != input[i][t1])
|| input[i - 1][t2] != input[i][t2])
{
index.push_back(i);
}
}
}
int findMax2(int index1, int index2)
{
int flag = res[index1][0];
for (;index1 < index2;index1++)
{
if (flag < res[index1][0])
{
flag = res[index1][0];
}
}
return flag;
}
int findMin2(int index1, int index2)
{
int flag = res[index1][3];
for (;index1 < index2;index1++)
{
if (res[index1][3] < flag)
{
flag = res[index1][3];
}
}
return flag;
}
代码思路:
首先进行文件读取
然后是连接算法,这里采用了sort merge,并以小表为基准创建hash结构
Group操作:
这里并没有使用新的空间来存储group的结果,而是根据group字段进行排序之后,用另外的结构index来记录t1.id2,t2.id2都相等的一个范围,比如说排序之后,第一行到第三行的两个字段值都相等,那么就记录一个0,3表示这一组开始于0,结束于3
因为select和后面order by需要使用原始数据没有的字段,所以就检测出每一个组max(t1.id1)和min(t2.id1),然后和t2.id2,t1.id2一起组合成新的数据,但都是使用之前的空间进行,不另外进行存储,把组合成的新数据挪到A的前面
这时已经得到了要输出的结果集,结果集存储在A中,范围是index.size(),然后对这部分数据按要求进行一次排序,之后输出即可得出最后结果。
亮点可能在于,没有开辟新的空间存储每一部分的结果集,而是使用了index进行范围的标注