如何在内存有限的情况下，找出大量数据中重复查询次数最多的语句

最新推荐文章于 2022-02-04 09:15:28 发布

ranyongqing

最新推荐文章于 2022-02-04 09:15:28 发布

阅读量2.2k

点赞数

分类专栏：大数据 C++ 文章标签：数据 iostream iterator hash string

本文链接：https://blog.csdn.net/ranyongqing/article/details/24499519

版权

C++ 同时被 2 个专栏收录

39 篇文章 0 订阅

订阅专栏

大数据

4 篇文章 0 订阅

订阅专栏

//500K内存，100万条数据，寻找其中重复查询次数最多的记录
//思想是先把100万条数据分到不同的文件中，但是在分文件的时候，要对其进行一定处理，分到对应的文件中，比如分成100个
//文件，那么对每一条string进行hash，得到的整数值%100，这样就分到不同的文件中了，并且能确保其他文件中没有这一条记录，
//然后对每一个文件进行处理，采用hash表统计每条记录出现的次数，然后存入相应的文件中。
//然后对每一个文件中的数据按出现次数进行快速排序，再存入文件，这样保证出现次数最多的记录排在最前面。
//然后对所有文件的第一条记录进行排序，这样就能够得到出现次数最多的那条记录了。
#include <algorithm>//sort()
#include <cstdlib>//rand(), atoi(), EXIT_FAILURE
#include <fstream>//wenjian
#include <iostream>
#include <sstream>//stringstream
#include <string>
#include <unordered_map>//undered_map
#include <utility>//pair()
#include <vector>//vector
using namespace std;

// Generates the sample input: writes 1,000,000 pseudo-random integers in the
// range [0, 9999] to "data.txt" in the current directory, one per line.
//
// rand() is not seeded here, so unless the caller has invoked srand() the
// generated sequence is the same on every run.
//
// Returns true when the file was opened and all lines were written and
// flushed; otherwise prints a message to stdout and returns false.
bool Generate_Data()
{
    std::ofstream outfile("data.txt");
    if (outfile.fail()) {
        std::cout << "Open file failed!" << std::endl;
        return false;
    }

    for (int i = 0; i < 1000000; ++i) {
        // '\n' rather than std::endl: endl would force a flush on every one
        // of the million lines.
        outfile << std::rand() % 10000 << '\n';
    }

    // close() flushes the buffer; a failed write or flush leaves the stream
    // in a failed state, which must not be reported as success.
    outfile.close();
    if (outfile.fail()) {
        std::cout << "Write file failed!" << std::endl;
        return false;
    }
    return true;
}

// Multiplicative string hash.
//
// For every character c of `str`, in order:  hash = hash * a + c,  after
// which the multiplier a is itself multiplied by b. All arithmetic is done in
// unsigned int and wraps on overflow. The result is masked with 0x7FFFFFFF,
// so the top bit of the returned value is always clear.
//
// Parameters:
//   str - the text to hash; taken by const reference so that const strings
//         and temporaries can be hashed as well (the function never modifies
//         it).
// Returns:
//   the hash value in the range [0, 0x7FFFFFFF]; 0 for an empty string.
inline unsigned int RSHash(const std::string &str)
{
    const unsigned int b = 378551;
    unsigned int a = 63689;
    unsigned int hash = 0;
    // size_type index avoids narrowing str.size() to int and the resulting
    // signed/unsigned comparison.
    for (std::string::size_type i = 0; i < str.size(); ++i) {
        hash = hash * a + str[i];
        a *= b;
    }
    return hash & 0x7FFFFFFF;
}
// Ordering predicate handed to std::sort: ranks (text, count) pairs by their
// count, largest count first. The text part is ignored, so pairs with equal
// counts compare as equivalent.

bool mysort(const std::pair<std::string,int> &x, const std::pair<std::string,int> &y)
{
    return y.second < x.second;
}

// Splits the input file into 50 bucket files and reduces every bucket to a
// sorted frequency table.
//
// Phase 1 (partition): each line of the file named by `str` is hashed with
//   RSHash and appended to "<hash % 50>.txt" in the current directory.
//   Identical lines always produce the same hash, so all occurrences of a
//   given line end up in the same bucket file.
// Phase 2 (count and sort): each bucket file is read back, the occurrences of
//   every distinct line are counted, and the file is overwritten with one
//   "<line> <count>" entry per distinct line, ordered by mysort (highest
//   count first; the order among equal counts is unspecified).
//
// Parameters:
//   str - path of the input file, one record per line.
// Returns:
//   true on success; false, after printing a message to stdout, when a file
//   cannot be opened or written.
bool Split_Into_Groups(std::string &str)
{
    const int kGroupCount = 50;

    std::ifstream infile(str);
    if (infile.fail()) {
        std::cout << "Open file failed!" << std::endl;
        return false;
    }

    // ---- Phase 1: distribute the lines over the bucket files ----
    std::ofstream outfile[kGroupCount];
    for (int i = 0; i < kGroupCount; ++i) {
        outfile[i].open(std::to_string(i) + ".txt");
        if (outfile[i].fail()) {
            std::cout << "Open file failed" << std::endl;
            return false;
        }
    }

    std::string line;
    while (std::getline(infile, line)) {
        // '\n' instead of std::endl avoids a flush per record.
        outfile[RSHash(line) % kGroupCount] << line << '\n';
    }
    infile.close();

    for (int i = 0; i < kGroupCount; ++i) {
        // close() flushes; a bucket that could not be written completely
        // would silently skew the counts, so treat it as a failure.
        outfile[i].close();
        if (outfile[i].fail()) {
            std::cout << "Write file failed" << std::endl;
            return false;
        }
    }

    // ---- Phase 2: count duplicates per bucket and rewrite it sorted ----
    for (int i = 0; i < kGroupCount; ++i) {
        const std::string file_name = std::to_string(i) + ".txt";

        std::unordered_map<std::string, int> word_num;
        {
            std::ifstream bucket_in(file_name);
            if (bucket_in.fail()) {
                std::cout << "Open file failed!" << std::endl;
                return false;
            }
            while (std::getline(bucket_in, line)) {
                // operator[] value-initialises a missing count to 0, so one
                // lookup handles both new and already-seen lines.
                ++word_num[line];
            }
        }   // bucket_in is closed here, before the same file is truncated

        // Copy the table into a vector so it can be ordered by count.
        std::vector<std::pair<std::string, int> > pair_vec(word_num.begin(), word_num.end());
        std::sort(pair_vec.begin(), pair_vec.end(), mysort);

        std::ofstream bucket_out(file_name);
        if (bucket_out.fail()) {
            std::cout << "Open file failed" << std::endl;
            return false;
        }
        for (std::vector<std::pair<std::string, int> >::const_iterator it = pair_vec.begin();
             it != pair_vec.end(); ++it) {
            bucket_out << it->first << ' ' << it->second << '\n';
        }
        bucket_out.close();
        if (bucket_out.fail()) {
            std::cout << "Write file failed" << std::endl;
            return false;
        }
    }
    return true;
}

// Program entry point.
//
// 1. Generate_Data() writes the sample input to "data.txt".
// 2. Split_Into_Groups() partitions it into the bucket files "0.txt" ..
//    "49.txt", each rewritten as "<record> <count>" lines with the highest
//    count first.
// 3. The first line of every bucket (that bucket's most frequent record) is
//    collected, the collected entries are sorted by count with mysort, and
//    they are written to "result.txt"; the first line of that file is the
//    overall most frequent record.
//
// Returns EXIT_FAILURE when any step fails, 0 otherwise.
int main()
{
    // Must match the number of bucket files produced by Split_Into_Groups.
    const int kGroupCount = 50;

    if (!Generate_Data())
        return EXIT_FAILURE;

    std::ofstream outfile("result.txt");
    if (outfile.fail()) {
        std::cout << "Open file failed!" << std::endl;
        return EXIT_FAILURE;
    }

    std::string str("data.txt");
    if (!Split_Into_Groups(str))
        return EXIT_FAILURE;

    // Collect the top entry of every bucket.
    std::vector<std::pair<std::string, int> > res;
    for (int i = 0; i < kGroupCount; ++i) {
        std::ifstream infile(std::to_string(i) + ".txt");
        if (infile.fail()) {
            std::cout << "Open file failed!" << std::endl;
            // Report the failure to the caller; returning `false` from main
            // would be exit status 0, i.e. success.
            return EXIT_FAILURE;
        }

        std::string first_line;
        if (!std::getline(infile, first_line))
            continue;   // empty bucket: nothing hashed into it

        // The count follows the last space, so records that themselves
        // contain spaces are still split correctly.
        const std::string::size_type found = first_line.find_last_of(' ');
        if (found == std::string::npos)
            continue;   // not in "<record> <count>" form: skip instead of
                        // recording stale or uninitialised values

        res.push_back(std::make_pair(first_line.substr(0, found),
                                     std::atoi(first_line.c_str() + found + 1)));
    }

    std::sort(res.begin(), res.end(), mysort);
    for (std::vector<std::pair<std::string, int> >::const_iterator it = res.begin();
         it != res.end(); ++it) {
        outfile << it->first << ' ' << it->second << '\n';
    }
    outfile.close();
    if (outfile.fail()) {
        std::cout << "Write file failed!" << std::endl;
        return EXIT_FAILURE;
    }

    // Blocks until a character (or end-of-file) arrives on stdin before
    // exiting, as the original did; presumably to keep a console window open.
    std::cin.get();
    return 0;
}