百度面试题query的分类排序

最新推荐文章于 2022-05-29 08:16:48 发布

世纪殇

最新推荐文章于 2022-05-29 08:16:48 发布

阅读量2.6k

点赞数

本文链接：https://blog.csdn.net/dasgk/article/details/23783073

版权

5、有10个文件，每个文件1G，每个文件的每一行都存放的是用户的query，每个文件的query都可能重复。如何按照query的频度排序？
回答：
　1）读取10个文件，按照hash(query)%10的结果将query写到对应的文件中。这样我们就有了10个大小约为1G的文件。任意一个query只会出现在某个文件中。
　2）对于1）中获得的10个文件，分别进行如下操作
　　-利用hash_map（query，query_count）来统计每个query出现的次数。
　　-利用堆排序算法对query按照出现次数进行排序。
　　-将排序好的query输出到文件中。
　　这样我们就获得了10个文件，每个文件中都是按频率排序好的query。

　3）对2）中获得的10个文件进行归并排序，并将最终结果输出到文件中。

解题思路：

当然这是从网上拿下来的，但是我得自己理一遍，这样以后才是自己的东西。我的理解是：同一个query可能分散出现在这10个文件的不同文件当中，所以我们首先要做的，是按hash值把相同的query归到同一个文件当中。这样同一个query只会出现在一个文件当中，我们才好统计它的出现次数。

将query的出现次数统计下来，放在内存当中，即hash_map(query,query_count)：依次读取每个query，如果在hash_map中查找到了，则query_count自增；没有查找到，则插入该query并将query_count置为1，这样就完成了该hash_map的构建。然后按出现次数排序，最后对各个文件的结果进行归并排序。

#include "stdafx.h"       
#include<iostream>       
#include<fstream>       
#include<Windows.h>       
#include<hash_map>  
using namespace std;     
/*
struct CharLess : public binary_function<const char*, const char*, bool>    
{    
public:    
	result_type operator()(const first_argument_type& _Left, const second_argument_type& _Right) const    
	{    
		return(strcmp(_Left, _Right) < 0 ? true : false);    
	}    
};    */
//Ordering predicate for C strings, used as the comparison part of the
//hash_compare traits of the hash_map in main.
//It compares the characters the pointers refer to, not the pointer values.
struct CharLess
{
public:
	//Returns true only when _Left sorts strictly before _Right.
	//The predicate must be a strict ordering: it is false for equal strings
	//and never true in both directions. Testing for "different" instead of
	//"less" would break that rule, because it is true both ways for any two
	//distinct strings.
	//Both arguments are const because a comparison must not modify its operands.
	bool operator()(const char* _Left, const char* _Right) const
	{
		return strcmp(_Left, _Right) < 0;
	}
};
//Maps a query string to a partition index.
//strQuery: NUL-terminated query text; every byte up to the terminator
//          (including any line-break characters) takes part in the hash.
//Returns a value in the range [0, 9]; a NULL pointer maps to 0.
int getHash(char* strQuery)
{
	if(strQuery==NULL)
	{
		return 0;
	}
	//Accumulate in an unsigned type and read each byte as unsigned char:
	//plain char may be signed, in which case bytes >= 0x80 (e.g. UTF-8 text)
	//would make the sum, and therefore the returned index, negative.
	unsigned int nIndex=0;
	//Walk to the terminator directly instead of calling strlen on every iteration.
	for(const char* pCur=strQuery;*pCur!='\0';++pCur)
	{
		nIndex+=static_cast<unsigned char>(*pCur);
	}
	return static_cast<int>(nIndex%10);
}
//Reads the next line (one query) from hFile.
//hFile:   handle of a file opened for reading.
//nOffset: in  - byte offset at which the line starts;
//         out - byte offset just behind the bytes consumed by this call.
//Returns a new[]-allocated, NUL-terminated copy of the line that always ends
//with '\n'; the caller releases it with delete[]. Returns NULL when no bytes
//are left at nOffset or when reading fails.
//A last line that is not terminated by '\n' is still returned (with '\n'
//appended) instead of being dropped, so that it is hashed and written exactly
//like the same query on a terminated line.
char* getQuery(HANDLE hFile,int& nOffset)
{
	const int nStart=nOffset;
	DWORD nRead=0;
	//First pass: advance byte by byte to find where the line ends.
	SetFilePointer(hFile,nStart,NULL,FILE_BEGIN);
	bool bTerminated=false;
	char chCur='\0';
	while(!bTerminated)
	{
		if(!ReadFile(hFile,&chCur,1,&nRead,NULL)||nRead==0)
		{
			//Read error or end of file before a line break was seen.
			break;
		}
		nOffset++;
		bTerminated=(chCur=='\n');
	}
	const int nLineLen=nOffset-nStart;
	if(nLineLen==0)
	{
		return NULL;
	}
	//Second pass: go back to the start of the line and read it in one call.
	//Two extra bytes: one for a possibly appended '\n', one for the terminator.
	char* strContent=new char[nLineLen+2];
	memset(strContent,0,nLineLen+2);
	SetFilePointer(hFile,nStart,NULL,FILE_BEGIN);
	if(!ReadFile(hFile,strContent,nLineLen,&nRead,NULL)||nRead!=static_cast<DWORD>(nLineLen))
	{
		delete[] strContent;
		return NULL;
	}
	if(!bTerminated)
	{
		strContent[nLineLen]='\n';
	}
	return strContent;
}
//将分类整理之后的query写入文件  
void WriteToFile(char* strQuery,int nMark)  
{  
	char strFileName[20]={0};  
	sprintf(strFileName,"result%d.txt",nMark);  
	HANDLE hFile=CreateFileA(strFileName,FILE_ALL_ACCESS,FILE_SHARE_READ,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);  
	if(hFile==INVALID_HANDLE_VALUE)  
	{  
		hFile=CreateFileA(strFileName,FILE_ALL_ACCESS,FILE_SHARE_READ,NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL);     
	}  
	DWORD nFileSize=GetFileSize(hFile,NULL);  
	//移动文件指针  
	SetFilePointer(hFile,nFileSize,NULL,FILE_BEGIN);  
	int nStrLen=strlen(strQuery);  
	DWORD midVal;  
	WriteFile(hFile,strQuery,nStrLen,&midVal,NULL);   
	CloseHandle(hFile);  
}  

int main()    
{    
	//首先依次读取10个文件内容，然后，根据读取到的内容进行分配,双层循环，外层循环是一次读取不同的文件，内层循环是依次读取每一行    
	char strFileName[50]={0};  
	hash_map<  const char* ,int, hash_compare< const char*, CharLess>> hashInstance;  
	for(int i=0;i<2;i++)    
	{    
		memset(strFileName,0,50);    
		sprintf(strFileName,"test%d.txt",i);    
		HANDLE hFile=CreateFileA(strFileName,FILE_ALL_ACCESS,FILE_SHARE_READ,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);    
		if(hFile==INVALID_HANDLE_VALUE)    
		{    
			cout<<"文件打开失败:"<<GetLastError()<<endl;    
			return 0;    
		}    
		//文件打开成功，现在开始读取文件内容    
		int nOffset=0;    
		char * strQuery=getQuery(hFile,nOffset);    
		while(strQuery!=NULL)    
		{    
			//在此获得hash值，然后根据hash值，写入文件                
			int nHashValue=getHash(strQuery);  
			WriteToFile(strQuery,nHashValue);  
			delete[] strQuery;  
			strQuery=getQuery(hFile,nOffset);   
		}    
		CloseHandle(hFile);    
	}    
	//到这一步，我们已经将查询字符串分类整理好了，下一步就应该是读取并计算了     
	for(int i=0;i<10;i++)  
	{  
		sprintf(strFileName,"result%d.txt",i);  
		HANDLE hFile=CreateFileA(strFileName,FILE_ALL_ACCESS,FILE_SHARE_READ,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);   
		if(hFile!=INVALID_HANDLE_VALUE)  
		{  
			int nOffset=0;  
			char* strQuery=getQuery(hFile,nOffset);  
			while(strQuery!=NULL)  
			{                 
				if(hashInstance.find(strQuery)!=hashInstance.end())  
				{  
					hashInstance[strQuery]++;  

				}  
				else  
				{  
					hashInstance.insert(make_pair(strQuery,1));  
				}                 
				//  delete[] strQuery;            
				strQuery=getQuery(hFile,nOffset);  
			}  

		}  
		CloseHandle(hFile);       
	}  
	//输出统计  
	hash_map<const char*,int>::iterator it=hashInstance.begin();  
	for(;it!=hashInstance.end();it++)  
	{         
		cout<<it->first<<":"<<it->second<<endl;  
		delete[] it->first;  
	}  
}