2005年百度之星程序设计大赛试题初赛题目-题4

最新推荐文章于 2023-10-06 11:22:25 发布

windmissing

最新推荐文章于 2023-10-06 11:22:25 发布

阅读量1.3k

点赞数

分类专栏： AStart 文章标签：百度 iterator whitespace function iostream insert

本文链接：https://blog.csdn.net/mishifangxiangdefeng/article/details/7573982

版权

AStart 专栏收录该内容

14 篇文章 0 订阅

订阅专栏

第四题（共四题 100 分）：低频词过滤（ 40 分）

题目描述：请编写程序，从包含大量单词的文本中删除出现次数最少的单词。如果有多
个单词都出现最少的次数，则将这些单词都删除。

输入数据：程序读入已被命名为 corpus.txt 的一个大数据量的文本文件，该文件包含英
文单词和中文单词，词与词之间以一个或多个 whitespace 分隔。（为便于调试，您可下载
测试 corpus.txt 文件，实际运行时我们会使用不同内容的输入文件。）

输出数据：在标准输出上打印删除了 corpus.txt 中出现次数最少的单词之后的文本（
词与词保持原来的顺序，仍以空格分隔）。

评分标准：程序输出结果必须正确，内存使用越少越好，程序的执行时间越快越好。

my answer：

吸取前面的教训，尽量考虑多种情况：能处理单个或多个空格、换行，以及单个或多个空行。

最正常的算法，三遍循环：

第一遍，读取文件，统计每个单词的出现次数。第二遍，遍历统计结果，求最小出现次数。第三遍，重新读取文件，只输出出现次数大于最小值的单词。

没想到什么时间或空间的优化

#include <iostream>
#include <fstream>
#include <string>
#include <map>
using namespace std;

// Number of occurrences of every whitespace-delimited word in corpus.txt.
map<string, int> M;

// Returns true for the characters that `operator>>` skips as whitespace in the
// default "C" locale, so that the counting pass and the printing pass split
// the text into exactly the same words (including "\r\n" line endings).
static bool is_separator(char ch)
{
	return ch == ' ' || ch == '\t' || ch == '\n' ||
	       ch == '\r' || ch == '\v' || ch == '\f';
}

// Returns true when `word` occurs more often than `min_count` and therefore
// has to be printed. Uses find() so that a lookup never inserts into M.
static bool is_kept(const string& word, int min_count)
{
	const map<string, int>::const_iterator hit = M.find(word);
	return hit != M.end() && hit->second > min_count;
}

// Prints corpus.txt to standard output with every least-frequent word removed.
//
// Pass 1 counts the words, pass 2 finds the smallest count, pass 3 re-reads
// the file character by character and echoes it, dropping the words whose
// count equals the minimum. The original whitespace is echoed unchanged.
//
// Returns 0 on success and 1 when corpus.txt cannot be opened.
int main()
{
	// Pass 1: count how often each word occurs.
	ifstream fin("corpus.txt");
	if (!fin)
	{
		cerr << "cannot open corpus.txt\n";
		return 1;
	}
	string word;
	while (fin >> word)
		++M[word];
	fin.close();

	// Pass 2: find the smallest occurrence count.
	int min = 0x7fffffff;
	for (map<string, int>::const_iterator it = M.begin(); it != M.end(); ++it)
		if (it->second < min)
			min = it->second;

	// Pass 3: the first stream is at end-of-file, so read through a fresh one.
	ifstream fin2("corpus.txt");
	if (!fin2)
	{
		cerr << "cannot open corpus.txt\n";
		return 1;
	}
	string str;
	char ch;
	// get(char&) makes the stream itself the loop condition, so the loop ends
	// at end-of-file instead of comparing a truncated EOF value against zero.
	while (fin2.get(ch))
	{
		if (is_separator(ch))
		{
			if (!str.empty() && is_kept(str, min))
				cout << str;
			str.clear();
			cout << ch;
		}
		else
			str += ch;
	}
	// The file may end without trailing whitespace: flush the last word too.
	if (!str.empty() && is_kept(str, min))
		cout << str;
	return 0;
}

牛A陈世熹的比赛答题源码：

也是三遍循环，思路差不多，只是用以 char* 为键、自定义比较函数的 map 来计数，并用 scanf/printf 代替 iostream。

#include <iostream> 
#include <cstdio> 
#include <cstring> 
#include <utility> 
#include <map> 

using namespace std; 

// Comparator for maps keyed by C strings: orders keys by their character
// content (strcmp) rather than by pointer value.
struct word_less
{
	// Returns true when the string at A sorts strictly before the string at B.
	bool operator()(char* A, char* B) const
	{
		const int order = std::strcmp(A, B);
		return order < 0;
	}
};

// Map from a C-string word to an int, ordered by string content (word_less).
typedef map<char*, int, word_less> my_map; 

// Occurrence count of every distinct word; keys are the copies made by get_word().
my_map Map; 
// N: number of words printed so far (main uses it to decide whether to print a
// separating space). Min: smallest occurrence count found in Map.
int N, Min; 
// Buffer that scanf fills with the word currently being processed.
char Word[100001]; 

// Returns a freshly allocated, NUL-terminated copy of the current contents of
// the global Word buffer. The caller owns the returned array.
char* get_word()
{
	// One extra byte so that the terminating NUL is copied as well.
	const size_t bytes = strlen(Word) + 1;
	char* copy = new char[bytes];
	memcpy(copy, Word, bytes);
	return copy;
}

// Prints the words of corpus.txt, separated by single spaces and followed by a
// newline, leaving out every word whose occurrence count is the smallest one.
//
// The file is read twice through stdin (redirected with freopen): once to
// count the words and once to print the survivors.
//
// Returns 0 on success and 1 when corpus.txt cannot be opened.
int main() 
{ 
	my_map::iterator It; 
	if (freopen("corpus.txt", "r", stdin) == NULL) 
	{ 
		perror("corpus.txt"); 
		return 1; 
	} 
	Map.clear(); 
	// The field width keeps scanf inside Word (100001 bytes including the NUL);
	// a longer token is split instead of overflowing the buffer.
	while (scanf("%100000s", Word) == 1) 
	{ 
		It = Map.find(Word); 
		if (It == Map.end()) 
			Map.insert(make_pair(get_word(), 1)); 
		else 
			++It->second; 
	} 
	// Smallest occurrence count over all distinct words.
	Min = 0x7fffffff; 
	for (It = Map.begin(); It != Map.end(); ++It) 
		if (It->second < Min) 
			Min = It->second; 
	N = 0; 
	// Second pass: rewind by reopening the file.
	if (freopen("corpus.txt", "r", stdin) == NULL) 
	{ 
		perror("corpus.txt"); 
		return 1; 
	} 
	while (scanf("%100000s", Word) == 1) 
	{ 
		// find() instead of operator[]: a lookup must never insert the shared
		// Word buffer as a key.
		It = Map.find(Word); 
		if (It == Map.end() || It->second <= Min) 
			continue; 
		if (N > 0) 
			printf(" "); 
		N++; 
		printf("%s", Word); 
	} 
	printf("\n"); 
	return 0; 
}

牛B楼天城的比赛答题源码：

牛B的代码还是一如既往地难以理解：先把整个文件一次性读入内存，把空白字符替换为 '\0'，从而在原地把文本切分成单词；再用自己写的链式哈希表（拉链法）代替 STL 中的 map 来统计词频。

#include <stdio.h> 
#include <string.h> 
#include <unistd.h> 

// Size in bytes of one read() request (128 KiB).
const int bufsize=128*1024; 
// Return value of the most recent read() call in ReadFile.
int bufL; 
// Buffer that read() fills before the bytes are appended to `text`.
char buf[bufsize]; 

// One entry of the chained hash table: a distinct word and its count.
struct THashPoint 
{ 
	// The word; HashTable_Insert stores the pointer it is given (Solve passes
	// addresses inside `text`), so no copy is made.
	char *s; 
	// Number of times the word has been inserted.
	int c; 
	// Next entry in the same bucket, or NULL at the end of the chain.
	THashPoint *next; 
}; 
// Index of the next unused element of Memory (also the number of distinct words).
int MemoryID=0; 
// Hash: array of HashSize bucket heads. Memory: pool from which entries are taken.
THashPoint **Hash,*Memory; 

// Entire contents of the input; Prepare overwrites separators with 0 in place.
char *text; 
// L: number of bytes in text. HashSize: number of buckets in Hash.
// minC: smallest occurrence count, computed in Solve.
int L,HashSize,minC; 

void ReadFile() 
{ 
	text=new char[bufsize+5]; 
	L=0; 
	int textL=bufsize+5; 
	while (1) 
	{ 
		bufL=read(0,buf,bufsize); 
		if (bufL==0) 
			break; 
		while (L+bufL>=textL) 
		{ 
			char *t_text=text; 
			textL*=2; 
			text=new char[textL]; 
			memcpy(text,t_text,L); 
		} 
		memcpy(text+L,buf,bufL); 
		L+=bufL; 
	} 
	text[L]=0; 
} 
// Returns true when n is a prime number, false otherwise (including n < 2).
// Trial division by every i with i <= n / i; the bound is written as a
// division so that it cannot overflow for n close to INT_MAX.
bool Prime(int n) 
{ 
	if (n<2) 
		return false; 
	for (int i=2;i<=n/i;i++) 
		if (n%i==0) 
			return false; 
	return true; 
} 
// Splits `text` into words in place and allocates the hash table.
//
// Every whitespace byte is overwritten with 0, so each word becomes a
// NUL-terminated string inside `text`. The number of word occurrences N then
// sizes the table: HashSize is the first prime >= 2*N+10, and Memory gets N+10
// entries, which is enough even if every occurrence is a distinct word.
void Prepare() 
{ 
	int N=0,i; 
	// Carriage return, vertical tab and form feed are separators as well, so a
	// file with "\r\n" line endings does not produce words ending in '\r'.
	for (i=0;i<L;i++) 
		if (text[i]==' ' || text[i]=='\t' || text[i]=='\n' || 
		    text[i]=='\r' || text[i]=='\v' || text[i]=='\f') 
			text[i]=0; 
	// A word starts at a non-zero byte that is first in the text or follows a 0.
	for (i=0;i<L;i++) 
		if ((i==0 || text[i-1]==0) && text[i]!=0) 
			N++; 
	HashSize=N*2+10; 
	while (!Prime(HashSize)) 
		HashSize++; 
	Hash=new THashPoint* [HashSize]; 
	for (i=0;i<HashSize;i++) 
		Hash[i]=NULL; 
	MemoryID=0; 
	Memory=new THashPoint[N+10]; 
} 
// Returns the bucket index of the NUL-terminated string s, in [0, HashSize).
//
// The hash starts from the string length and folds in every byte with a
// multiplier of 137. The running value is kept in a 64-bit integer because
// (HashSize-1)*137 no longer fits in an int for large tables; for tables where
// the original int arithmetic did not overflow the result is the same.
int HashTable_function(char *s) 
{ 
	long long address=static_cast<long long>(strlen(s)%HashSize); 
	for (int i=0;s[i];i++) 
		address=(address*137+s[i]+128)%HashSize; 
	return static_cast<int>(address); 
} 
void HashTable_Insert(char *s) 
{ 
	int address=HashTable_function(s); 
	THashPoint *p; 
	for (p=Hash[address];p!=NULL;p=p->next) 
		if (strcmp(p->s,s)==0) 
		{ 
			p->c++; 
			return; 
		} 
	p=&Memory[MemoryID++]; 
	p->s=s; 
	p->c=1; 
	p->next=Hash[address]; 
	Hash[address]=p; 
} 
bool Print(char *s) 
{ 
	int address=HashTable_function(s); 
	THashPoint *p; 
	for (p=Hash[address];p!=NULL;p=p->next) 
		if (strcmp(p->s,s)==0 && p->c==minC) 
			return false; 
	return true; 
} 
// Counts every word of `text`, determines the smallest count (minC) and prints
// the words for which Print() returns true, in order, separated by one space.
// A word starts at a non-zero byte that is first in the text or follows a 0.
void Solve() 
{ 
	int pos; 

	// Count the occurrences of every word.
	for (pos=0;pos<L;++pos) 
	{ 
		const bool wordStart=text[pos]!=0 && (pos==0 || text[pos-1]==0); 
		if (wordStart) 
			HashTable_Insert(text+pos); 
	} 

	// Smallest count over all distinct words.
	minC=2000000000; 
	for (int k=0;k<MemoryID;++k) 
	{ 
		if (minC>Memory[k].c) 
			minC=Memory[k].c; 
	} 

	// Print the surviving words; a space goes before every word but the first.
	bool needSpace=false; 
	for (pos=0;pos<L;++pos) 
	{ 
		if (text[pos]==0) 
			continue; 
		if (pos!=0 && text[pos-1]!=0) 
			continue; 
		if (!Print(text+pos)) 
			continue; 
		if (needSpace) 
			printf(" "); 
		needSpace=true; 
		printf("%s",text+pos); 
	} 
} 
// Redirects stdin to corpus.txt, loads and tokenises the file, then prints it
// without its least-frequent words.
// Returns 0 on success and 1 when corpus.txt cannot be opened.
int main() 
{ 
	// Without this check a missing file leaves stdin unusable and the
	// following read() calls fail.
	if (freopen("corpus.txt","r",stdin)==NULL) 
	{ 
		perror("corpus.txt"); 
		return 1; 
	} 
	ReadFile(); 
	Prepare(); 
	Solve(); 
	return 0; 
}