大数据处理例之STL实战练习

5 篇文章 0 订阅
3 篇文章 0 订阅

找出3000000条字符串中使用频率最高的10条

要求:高效

目的:锻炼STL的用法,涉及到STL各大类组件使用,包括自定义hash索引类,自定义hash函数,STL堆算法,C++类的设计方法等

主要用到hash_map和heap

生成原始数据脚本(matlab文件):

base = 97;
alpha = 26;
total = 3000000;
max_len = 4;
min_len = 1;
filename = 'data.txt';
fid = fopen(filename,'w+');

for n=1:total
    seed=mod(floor(rand()*100),max_len)+min_len;
    for m=min_len:seed
        fprintf(fid,char(mod(floor(rand()*100),alpha)+base));
    end
    fprintf(fid,'\n');
end
fclose(fid);
生成的数据文件大致为data.txt:
z
cvyh
kf
pjoh
rdr
cukv
ba
...
STL C++求解实现:
#include <functional>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <string>
#include <hash_map>

using namespace std;
using namespace stdext;

class Node
{
public:
	Node(){_str="";};
	Node(const Node &n){_str = n.getv();};
	Node(const std::string & str):_str(str){};
	Node(const char * pstr):_str(pstr){};
	~Node(){};

	std::string  getv() const
	{
		return _str;
	}

	void setv(const std::string & str)
	{
		_str = str;
	}

	Node & operator = (const Node & m)
	{
		_str = m.getv();
		return *this;
	}

	friend bool operator < (const Node &n,const Node &m)
	{
		return n._str < m._str;
	}

	friend bool operator == (const Node &n,const Node &m)
	{
		return n._str == m._str;
	}

	friend ostream & operator << (ostream& o,const Node&n)
	{
		o<<n.getv();
		return o;
	}
protected:
private:
	std::string _str;
};

inline size_t node_hash_value(const Node & n)
{
	return stdext::hash_value(n.getv().c_str());
}

struct node_hash_compare:public hash_compare<Node>
{
	size_t operator ()(const Node &n)const
	{
		return node_hash_value(n);
	}

	bool operator () (const Node & n, const Node &m)const
	{
		return n.getv() < m.getv();
	}
};

ostream & operator << (std::ostream& o,const pair< Node,int> &n )
{
	o<<n.first<<":"<<n.second;
	return o;
}

struct node_hash_greater
{
	bool operator()(const pair<Node,int> &m,const pair<Node,int> &n)
	{
		return m.second > n.second;
	}
};

const char *filename = "data.txt";
const int find_num = 10;

int main()
{
	ifstream out;
	hash_map<Node,int,node_hash_compare> nodes;

	out.open(filename, ios::in);
	std::string line;  
	while(!out.eof())  
	{  
		std::getline(out,line);
		if (line.size() == 0)
		{
			continue;
		}
		Node n(line);
		if (nodes.count(n) > 0)
		{
			nodes[n] = nodes[n] + 1;
		}
		else
		{
			nodes[n] = 1;
		} 
	}

	//copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));
	out.close();  

	pair<Node,int>  A[find_num+1];
	if (nodes.size() < find_num)
	{
		copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));
		return 0;
	}

	hash_map<Node,int,node_hash_compare>::iterator it = nodes.begin();
	int s=0;
	for (;s<find_num;s++,it++)
	{
		A[s].first = it->first;
		A[s].second = it->second;
	}

	make_heap(A, A + find_num,node_hash_greater());
	cout<<"\nheap start:\n";
	copy(A, A + find_num,std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));

	for (;it != nodes.end();it++)
	{
		if (it->second > A[0].second)
		{
			A[find_num].first = it->first;
			A[find_num].second = it->second;
			push_heap(A,A+find_num+1,node_hash_greater());
			pop_heap(A,A+find_num+1,node_hash_greater());
		}

	}

	sort_heap(A, A + find_num,node_hash_greater());
	cout<<"\nresult:\n";
	copy(A, A + find_num,std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));
	return 0;	
}
 

结果:



  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值