热门查询统计算法-CSDN博客

本文链接：https://blog.csdn.net/sankt/article/details/958648

/* 寻找热门查询：搜索引擎会通过日志文件把用户每次检索使用的所有检索串都记录下来，每个查询串的长度为1-255字节。假设目前有一千万个记录，这些查询串的重复度比较高，虽然总数是1千万，但如果除去重复后，不超过3百万个。一个查询串的重复度越高，说明查询它的用户越多，也就是越热门。请你统计最热门的10个查询串，要求使用的内存不能超过1G。（1）请描述你解决这个问题的思路；（2）请给出主要的处理流程，算法，以及算法的复杂度。
author:sankt.

我的程序基于一个简单的思想：读取log文件（每一行是一个搜索串），用map统计每个搜索串出现的次数，再把统计结果按出现次数从高到低排序后输出。
*/

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using namespace std;

// One tally: a query string paired with the number of times it occurred.
typedef pair<string,int> Newpair;

// A sequence of tallies, so that they can be reordered by count.
typedef vector<Newpair> Newvector;

// Ordering predicate over tallies. A tally goes before another one when its
// count is strictly larger, so sorting with this predicate produces
// descending counts. The query strings themselves are never compared.
struct intCmp
{
    bool operator()( const Newpair& lhs,const Newpair& rhs ) const
    {
        const bool lhsHasLargerCount = rhs.second < lhs.second;
        return lhsHasLargerCount;
    }
};

int main()
{

    map<string,int> mapStr;
    ifstream fin("log.txt");
    if(fin == NULL)
    {
         cerr<<"The file was not opened."<<endl;
         exit(1);
    }
    string strTemp;
    map<string,int>::iterator ite;

    while(getline(fin,strTemp))
    {
        ite = mapStr.find(strTemp);
        if(ite != mapStr.end())
        {
             mapStr[strTemp]++;
        }
        else
        {
             mapStr.insert(make_pair(strTemp,1));
        }
    }

    Newvector vec;

    for(ite = mapStr.begin();ite != mapStr.end();++ite)
    {
         cout<<(*ite).first<<"   "<<(*ite).second<<endl;
         pair<string,int> pa( (*ite).first,(*ite).second );
         vec.push_back( pa );
    }
    cout<<endl;
    cout<<"===================After Sort==================="<<endl;
    cout<<endl;


    sort(vec.begin(),vec.end(),intCmp());

    Newvector::iterator vecite;
    for(vecite = vec.begin();vecite != vec.end();++vecite)
    {
         cout<<(*vecite).first<<"   "<<(*vecite).second<<endl;
    }
    system("pause");
    return 0;
}