海量数据，top100热词，手把手教学

#define MAX 100000
#define MIN 1000

/**
 * Creates "data.txt" and fills it with 10,000,000 pseudo-random integers,
 * one per line, each in the closed range [MIN, MAX] (file-level macros).
 *
 * The generator is seeded from the current time, so every run produces a
 * different file.
 *
 * @return 0 on success, -1 if "data.txt" could not be opened for writing.
 */
int generate()
{
	std::ofstream ofile("data.txt", std::ios::out);
	if (!ofile.is_open())
	{
		std::cout << "打开失败";
		return -1;
	}

	std::srand(unsigned(std::time(0)));

	// RAND_MAX is only guaranteed to be >= 32767, which is smaller than the
	// requested range on some platforms. Two draws are combined into one wide
	// value so that every number in [MIN, MAX] can actually be produced.
	const unsigned long long rangeSize = MAX - MIN + 1;
	const unsigned long long radix = static_cast<unsigned long long>(RAND_MAX) + 1;

	for (int i = 0; i < 10000000; ++i) {
		const unsigned long long high = static_cast<unsigned long long>(std::rand());
		const unsigned long long low = static_cast<unsigned long long>(std::rand());
		const unsigned long data = static_cast<unsigned long>((high * radix + low) % rangeSize + MIN);
		// '\n' instead of endl: flushing after each of ten million lines is needlessly slow.
		ofile << data << '\n';
	}
	ofile.close();
	return 0;
}

1.1.写文件的操作

    ofstream ofile;// Step 1: declare the output file stream
	ofile.open("data.txt", ios::out);// Step 2: create/open "data.txt" in ios::out (write) mode
	ofile << data << endl; // Step 3: write one value followed by a newline to data.txt
	ofile.close();// Step 4: close the file

1.2.读文件的操作

见后面

1.3.随机生成数的操作

    #define MAX 。。。。// placeholder: choose your own upper bound
    #define MIN 。。。。// placeholder: choose your own lower bound
	clock_t start_time = clock();// record the processor time at this point
	srand(unsigned(time(0)));// seed rand() from the current time
    unsigned long data = rand() % (MAX - MIN + 1) + MIN;// remainder is in [0, MAX-MIN], so data lies in [MIN, MAX]

2.把data.txt文件分散为500个子文件

/**
 * Splits "data.txt" into `num` bucket files named "0.txt" ... "<num-1>.txt".
 *
 * Every non-empty line is appended to the bucket selected by
 * hash(line) % num, so identical lines always end up in the same bucket.
 * Progress (the number of lines read so far) is printed whenever the line
 * counter reaches a multiple of 10000 on a non-empty line.
 *
 * @param num Number of bucket files to create; must be positive.
 * @return Always 0. Failures (non-positive `num`, or a file that cannot be
 *         opened) are reported on stdout and end the function early.
 */
int spit(int num)
{
	// A non-positive bucket count would make the modulo below divide by zero.
	if (num <= 0)
	{
		std::cout << "num必须为正数";
		return 0;
	}

	std::ifstream file("data.txt", std::ios::in);
	if (!file.is_open())
	{
		std::cout << "打开失败";
		return 0;
	}

	// The vector owns the streams: they are flushed and closed automatically
	// on every return path, so no manual delete[]/close bookkeeping is needed.
	std::vector<std::ofstream> buckets(static_cast<std::size_t>(num));
	for (int b = 0; b < num; ++b)
	{
		buckets[b].open(std::to_string(b) + ".txt", std::ios::out);
		if (!buckets[b].is_open())
		{
			// Typically the per-process limit on open files was reached.
			std::cout << "打开失败";
			return 0;
		}
	}

	std::hash<std::string> str_hash;
	std::string strLine;
	long long lineNo = 0;
	while (std::getline(file, strLine))
	{
		++lineNo;
		if (strLine.empty())
			continue;
		const std::size_t ton = str_hash(strLine) % static_cast<std::size_t>(num);
		// '\n' instead of endl avoids flushing the bucket after every line.
		buckets[ton] << strLine << '\n';
		if (lineNo % 10000 == 0)
		{
			std::cout << lineNo << std::endl;
		}
	}
	return 0;
}

3.对每一个子文件进行（降序）排序，并将前100的数据分别放到“前一百N.txt”文件中（N为子文件编号，例如“前一百1.txt”、“前一百2.txt”……）

// Key/count record: `first` is the key text, `second` is its occurrence count.
using PAIR = std::pair<std::string, int>;

// Ordering predicate for std::sort: true when `lhs` has a strictly larger
// count than `rhs`, which sorts records in descending order of count.
// Reversing the comparison would give ascending order instead.
bool cmp_by_value(const PAIR& lhs, const PAIR& rhs) {
	const bool lhsHasMore = rhs.second < lhs.second;
	return lhsHasMore;
}

int get100(int num)
{
	for (int q = 1; q < num; q++)//500是文件数，假设分成了500个文件
	{
		ifstream file; // 建立小文件对象 读
		ofstream ofile; // 建立小文件对象 写
		string filename = to_string(q) + ".txt"; // 文件来向和去向设定 to_string的作用是把数值改为字符串格式
		file.open(filename, ios::in);
		ofile.open("前一百" + filename, ios::out);
		if (!file.is_open())
			break;
		string strLine;//用于读文件
        map<string, int> hm;//存储key关键字和num频率 <string,int>分别对应数据key和频率num
		while (getline(file, strLine)) {
			if (strLine.empty())
				continue;
			hm[strLine] += 1; //map 统计频率 比如数据"1234"出现了一次，那么就hm["1234"]+=1;
		}
		cout << "*****" << endl;
		vector<PAIR> vec(hm.begin(), hm.end());//这里是把哈希表中的数据放进vector，留意这个PAIR，上面已经定义了
		sort(vec.begin(), vec.end(), cmp_by_value);//这里是排序算法 头文件要包含#include <algorithm>  排序后的结果是降序的 具体原因看PAIR的定义
		cout << "***" << endl;
		int length = 0;
		vector<PAIR>::iterator it;//vector的迭代器
		for (it = vec.begin(); it != vec.end(); it++)//it=vec.begin()即指向vec的第一个数据
                                                     //vec.end()同理 
		{
			length++;
		}
		int i = 0;
		while (true) // 输出所有的数据，为什么不是全部前100的数据，因为一个小文件的数据太少了，远达不到100个，这个时候还试图取前一百个例如调用vec[100]就会造成越界  
		{
			cout << i << "   " << vec[i].first << "  " << vec[i].second << endl;
			ofile << vec[i].first << "," << vec[i].second << endl;
			//if (i >= 100 && vec[i].second != vec[i + 1].second) // 保证100之后是否存在和100相同的数据  （这是取前100的时候才用的判断）
			if (i == length - 1)
				break;
			i++;
		}
		file.close();//文件在最后一定要记得关闭
		ofile.close();//文件在最后一定要记得关闭
	}
	return 0;
}

3.1.打开需要读和写的文件

        ifstream file; // input stream for one bucket file
		ofstream ofile; // output stream for that bucket's result file
		string filename = to_string(q) + ".txt"; // to_string turns the index q into text, giving e.g. "7.txt"
		file.open(filename, ios::in);
		// NOTE(review): the result file is opened (and therefore created) before the input is checked.
		ofile.open("前一百" + filename, ios::out);
		// Leave the enclosing loop as soon as a bucket file cannot be opened.
		if (!file.is_open())
			break;

3.2.把子文件的所有数据放入map中，记录数据key和num

        string strLine;// holds the line currently being read
        map<string, int> hm;// frequency table: line text (key) -> number of occurrences
		while (getline(file, strLine)) {
			// Blank lines are not counted.
			if (strLine.empty())
				continue;
			hm[strLine] += 1; // e.g. each time "1234" is read, hm["1234"] grows by one
		}

3.3.把map中的数据放入vector中，为排序做准备

vector<PAIR> vec(hm.begin(), hm.end());// copy every (key, count) entry of hm into a vector of PAIR (the typedef declared earlier)

3.4.对vector进行排序

sort(vec.begin(), vec.end(), cmp_by_value);// needs #include <algorithm>; the order is descending by count because cmp_by_value compares with '>'

3.5.利用迭代器统计vector的长度

        // Count the elements of vec by walking it once from begin() to end().
        int length = 0;
		vector<PAIR>::iterator it;// iterator over the vector
		for (it = vec.begin(); it != vec.end(); it++)// begin() refers to the first element,
                                                     // end() to the position just past the last one
		{
			length++;
		}

3.6.把子文件出现频率为前面的数据存入子文件的子文件中

        int i = 0;
		// Writes every entry of vec rather than only the first 100: the author notes that a
		// small file may hold fewer than 100 entries, in which case vec[100] would be out of range.
		// NOTE(review): vec[0] is read before any size check, so an empty vec is accessed out of bounds.
		while (true)
		{
			cout << i << "   " << vec[i].first << "  " << vec[i].second << endl;
			ofile << vec[i].first << "," << vec[i].second << endl;
			// Disabled variant intended for a real top-100 cut that keeps entries tied with the 100th:
			//if (i >= 100 && vec[i].second != vec[i + 1].second)
			// Stop after the last element has been written.
			if (i == length - 1)
				break;
			i++;
		}

3.7.关闭3.1打开的文件

        file.close();// close the input file once it is no longer needed
		ofile.close();// close the output file once it is no longer needed

4.把所有“前一百N.txt”文件（N为子文件编号，例如“前一百1.txt”、“前一百2.txt”……）集合到“all.txt”文件中

    /**
     * Concatenates the per-bucket result files "前一百0.txt" ...
     * "前一百<fileCount-1>.txt" into "all.txt", skipping blank lines.
     *
     * For each bucket, "打开成功" or "打开失败" is printed to stdout; a bucket
     * that cannot be opened is skipped.
     *
     * @param num       In/out counter, incremented once per line copied.
     * @param fileCount Number of bucket result files to merge (default 500).
     * @return Always 0.
     */
    int getALl(int& num, int fileCount = 500) {
        std::ofstream ofile("all.txt", std::ios::out);
        for (int q = 0; q < fileCount; q++) {
            // Must match the naming used when the per-bucket results are written:
            // the prefix "前一百" followed by "<index>.txt".
            const std::string filename = "前一百" + std::to_string(q) + ".txt";
            std::ifstream file(filename, std::ios::in);
            if (!file.is_open())
            {
                std::cout << "打开失败" << std::endl;
                continue;
            }
            std::cout << "打开成功" << std::endl;

            std::string strLine;
            while (std::getline(file, strLine))
            {
                if (strLine.empty())
                    continue;
                // '\n' instead of endl avoids flushing all.txt after every line.
                ofile << strLine << '\n';
                num += 1;
            }
        }
        ofile.close();
        return 0;
    }

5.对“all.txt"文件进行排序，排序后把top100热词输出到“last.txt"文件中

 /**
  * Reads "all.txt" ("key,count" lines), sums the counts per key, sorts the
  * keys by descending total (ties ordered by key so the output is
  * reproducible) and writes the top 100 as "key,count" lines to "last.txt".
  *
  * Entries tied with the 100th are written as well, so a tie at the cut-off
  * is never split. With fewer than 100 keys everything is written; an empty
  * input yields an empty "last.txt".
  *
  * Lines without a comma, or with an empty key, are skipped. The LAST comma
  * of a line separates key and count, so keys may themselves contain commas.
  * Each parsed line, the full ranking and each written entry are echoed to
  * stdout.
  *
  * @return Always 0.
  */
 int last(){
	using Entry = std::pair<std::string, long long>;
	const std::size_t kTopN = 100;

	std::ifstream file("all.txt", std::ios::in);
	std::ofstream ofile("last.txt", std::ios::out);

	// Integer totals: floating-point totals would be printed in scientific
	// notation (and lose digits) once they reach a million.
	std::map<std::string, long long> hm;
	std::string strLine;
	while (std::getline(file, strLine)) {
		if (strLine.empty())
		{
			std::cout << "继续啊" << std::endl;
			continue;
		}
		// Split on the string itself instead of copying into a fixed-size
		// buffer, so lines of any length are handled.
		const std::string::size_type comma = strLine.rfind(',');
		if (comma == std::string::npos || comma == 0)
			continue;
		const std::string change = strLine.substr(0, comma);
		const long long b = std::strtoll(strLine.c_str() + comma + 1, nullptr, 10);
		std::cout << "key:" << change << "num:" << b << std::endl;
		hm[change] += b;
	}

	std::vector<Entry> vec(hm.begin(), hm.end());
	std::sort(vec.begin(), vec.end(), [](const Entry& lhs, const Entry& rhs) {
		if (lhs.second != rhs.second)
			return lhs.second > rhs.second;
		return lhs.first < rhs.first;
	});
	std::cout << "***" << std::endl;
	for (const Entry& entry : vec)
	{
		std::cout << entry.first << "    " << entry.second << std::endl;
	}

	// The index is checked against size() before every access, so short or
	// empty input can never be read out of bounds.
	for (std::size_t i = 0; i < vec.size(); ++i)
	{
		if (i >= kTopN && vec[i].second != vec[kTopN - 1].second)
			break;
		std::cout << i << "   " << vec[i].first << "  " << vec[i].second << std::endl;
		ofile << vec[i].first << "," << vec[i].second << '\n';
	}
	return 0;
 }

5.1.分割字符串

// Reads "key,count" lines from `file` and accumulates the count per key in hm.
string strLine;
while (getline(file, strLine)) {
		// Blank lines are reported and skipped.
		if (strLine.empty())
		{
			cout << "继续啊" << endl;
			continue;
		}
		// NOTE(review): the buffer holds at most 19 characters plus the terminator; longer lines do not fit.
		char save[20];
		strcpy_s(save, strLine.c_str());// copy the line into a writable char buffer, because strtok_s modifies its input
		char* token = NULL;
		char* ptr = NULL;// context pointer in which strtok_s keeps its position between calls
		token = strtok_s(save, ",", &ptr);// first field, up to the first ','
		char* a = token;// key
		string change = a;
		token = strtok_s(NULL, ",", &ptr);// passing NULL continues with the next field of the same buffer
		// NOTE(review): token is NULL here when the line contains no second field; atof is then given a null pointer.
		double b = atof(token);// count, converted from text to a number
		cout << "key:" << change << "num:" << b << endl;
		hm[change] += b; // add this line's count to the running total for the key
	}

all.txt文件的内容如上图所示，每一行左边是数据key，右边是频率num

注意：中间还有","

读文件的时候是一行一行读的，所以我们要分割字符串才可以分别记录key和num

这时候利用strtok_s();这个函数

Rosy15

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
海量数据，top100热词，手把手教学

题目内容（1）大数据分析问题[问题描述]某搜索公司一天的用户搜索词汇是海量的（百亿数据量），请设计一种求出每天最热top 100 词汇的可行办法。[基本要求]（1）随机生成海量数据，存入文件；从文件读入数据来处理。（2）显示数据文件的每一次处理结果。 ...
复制链接

扫一扫