想成为Google工程师？先回答这15个面试问题【这只是一必要条件】（五）

最新推荐文章于 2022-01-28 16:36:08 发布

无能所以愤怒

最新推荐文章于 2022-01-28 16:36:08 发布

阅读量1.1k

点赞数

分类专栏：化茧成蝶文章标签： google 面试 character tree string iterator

本文链接：https://blog.csdn.net/zengniao/article/details/7283809

版权

化茧成蝶专栏收录该内容

18 篇文章 0 订阅

订阅专栏

6.从一大块文本中去掉重复的行

关于这道题的想法有四个：

第一种想法：读入一个字符串，与前面读入的每一个字符串比较：如果已经存在，则跳过；如果不存在，则打印。时间复杂度为O(n*n*K)，K为比较两个字符串的时间。代码如下：

// Program that removes duplicate lines from a text file
#include <iostream>
#include <fstream>
#include <string>
#include <vector>

using namespace std;

// Input is read from "input.txt" and results are written to "output.txt".
// Both streams are global objects constructed with their file names.
ifstream fin("input.txt");
ofstream fout("output.txt");

// Every line of the input, in file order (filled by readText).
vector<string> lines;

//read the text into the memory
void readText()
{
	string line;

	while (getline(fin, line))
	{
		lines.push_back(line);
	}//end while
}

//remove the duplicate line
void removeduplicate()
{
	fout << lines[0] << endl;
	for (int i=1; i < (int)lines.size(); i ++)
	{
		bool flag = true;
		string str1 = lines[i];
		string str2;
		for(int j=0; j < i; j ++)
		{
			str2 = lines[j];
			if (str1 == str2)
			{
				flag = false;
				break;
			}
		}//end for j

		if (flag)
		{
			fout << str1 << endl;
		}
	}//end for i
}

// Entry point: load the input, then write it back without repeated lines.
int main()
{
	readText();          // fill `lines` from the input stream
	removeduplicate();   // emit the first occurrence of every line

	return 0;
}

第二种想法：读入数据，对数据行进行排序（可以使用快排），然后从第二行开始遍历，与前一行比较，如果相同则跳过，不同则打印。注意：输出按字典序排列，不保留原文中的行顺序。算法时间复杂度为O(n*log(n)*K)。代码如下：

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <algorithm>

using namespace std;

// Input is read from "input.txt" and results are written to "output.txt".
// Both streams are global objects constructed with their file names.
ifstream fin("input.txt");
ofstream fout("output.txt");

// Every line of the input (filled by readText, sorted in place by removeduplicate).
vector<string> lines;

//read the lines from the text
void readText()
{
	string line;
	while(getline(fin, line))
	{
		lines.push_back(line);
	}//end while
}

// Sort `lines` in place and write each distinct line to `fout` once.
//
// After sorting, equal lines are adjacent, so a line is new exactly when it
// differs from the one before it. The output is therefore in sorted order,
// not in the order of the input file. Cost: O(n * log(n) * K).
// Empty input is handled: nothing is written.
void removeduplicate()
{
	sort(lines.begin(), lines.end());

	const int total = (int)lines.size();
	for (int i = 0; i < total; i ++)
	{
		// i == 0 has no predecessor and is always new; testing it inside the
		// loop avoids touching lines[0] when the vector is empty.
		if (i == 0 || lines[i] != lines[i-1])
		{
			fout << lines[i] << '\n';
		}
	}
}

// Entry point: read all lines, then sort them and write the distinct ones.
int main()
{
	readText();          // fill `lines` from the input stream
	removeduplicate();   // sort and emit each distinct line once

	return 0;
}

第三种想法：采用Trie树（字典树）的想法，读入数据，构建一棵Trie树，利用Trie树进行排重，然后再把Trie树打印出来。时间复杂度O(n*K)，K为字符串的长度。代码如下：

#include <iostream>
#include <fstream>
#include <string>

using namespace std;

// Input is read from "input.txt" and results are written to "output.txt".
// Both streams are global objects constructed with their file names.
ifstream fin("input.txt");
ofstream fout("output.txt");


// One node of the trie used to de-duplicate lines.
struct Character
{
	char c;                 // character this node stands for
	bool end;               // true when at least one input line ends at this node
	Character *next[256];   // children, indexed by character value; NULL when absent

	// Start every node empty regardless of how it is created (global, stack,
	// `new Character` or `new Character()`), so traversal never follows an
	// uninitialized pointer or reads an uninitialized `end` flag.
	Character() : c('\0'), end(false)
	{
		for (int i = 0; i < 256; i ++)
		{
			next[i] = NULL;
		}
	}
};

// Root node of the trie. It stands for the empty prefix; its children are
// the first characters of the stored lines.
Character Tree;
// Heap-allocated scratch buffer with room for 100 chars; released with
// delete[] at the end of main.
char * line = new char[100];

//read the line and generate the Tri Tree
void generateTriTree()
{
	string str; 

	while (getline(fin,str))
	{
		Character * tmp = &Tree;
		for (int i=0; i<str.size(); i++)
		{
			if (tmp->next[str[i]] == NULL)
			{
				tmp->next[str[i]] = new Character();
				tmp->next[str[i]]->c = str[i];
			}
			tmp = tmp->next[str[i]];
		}
		tmp->end = true;
	}//while
}

// Depth-first print of the subtree rooted at `tree`.
//
// tree  - node to visit; NULL is accepted and ignored.
// index - depth of `tree` below the root's children (0 for a first
//         character); the node's character is stored at this position of
//         the current path.
//
// Whenever a node marks the end of a stored line, the path from the top of
// the trie down to that node is written to `fout` as one line. Children are
// visited in ascending index order.
void printTriTree(Character* tree, int index)
{
	// Characters on the path to the current node. A growable string is used
	// so that lines of any length can be printed without overrunning a
	// fixed-size buffer.
	static std::string path;

	if(tree == NULL)
		return;

	// Trim anything left over from a deeper, already finished branch and
	// make room for this node's character.
	path.resize(index + 1);
	path[index] = tree->c;

	// End of a stored line
	if (tree->end)
	{
		fout << path << '\n';
	}

	for (int i=0; i < 256; i ++)
	{
		printTriTree(tree->next[i], index+1);
	}
}

// Free the subtree rooted at `tree`: all 256 child slots are released
// recursively first, then the node itself is deleted.
// A NULL pointer is accepted and ignored.
void release(Character* tree)
{
	if (tree != NULL)
	{
		int slot = 0;
		while (slot < 256)
		{
			release(tree->next[slot]);
			++slot;
		}

		delete tree;
	}
}

// Entry point: build the trie from the input, print every stored line once,
// then free the trie, close both files and release the scratch buffer.
int main()
{
	generateTriTree();

	// An empty input line ends at the root itself, which is never passed to
	// printTriTree. Emit it here so blank lines are kept (once) instead of
	// being silently dropped.
	if (Tree.end)
	{
		fout << '\n';
	}

	for (int i=0; i < 256; i ++)
	{
		printTriTree(Tree.next[i], 0);
	}

	// The root is a global object, so only its children are heap-allocated
	// and need to be released.
	for (int i=0; i < 256; i ++)
	{
		release(Tree.next[i]);
	}
	fin.close();
	fout.close();
	delete [] line;
	return 0;
}

第四种方法：采用Hash/set进行排重，读入一个数据，放入hash/set中，如果之前不存在则打印出来，已存在则不进行打印。使用哈希表时平均时间复杂度为O(n*K)，K为Hash过程损耗的时间。下面的代码使用的是std::set（有序集合，插入为对数复杂度），先把所有行放入集合，最后统一按字典序输出，时间复杂度为O(n*log(n)*K)。代码如下：

#include <iostream>
#include <fstream>
#include <set>
#include <string>

using namespace std;

// Distinct lines of the input; the set stores each line at most once.
set<string> lineset;

// Input is read from "input.txt" and results are written to "output.txt".
// Both streams are global objects constructed with their file names.
ifstream fin("input.txt");
ofstream fout("output.txt");

// Read the input stream to its end and insert every line into `lineset`.
// Inserting a line that is already present leaves the set unchanged, which
// is how duplicates are dropped.
void readText()
{
	for (string str; getline(fin, str); )
	{
		lineset.insert(str);
	}
}

void print()
{
	set<string>::iterator it;
	for (it = lineset.begin(); it != lineset.end(); it ++)
	{
		fout << *it << endl;
	}
}


int main()
{
	readText();
	return 0;
}

无能所以愤怒

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
想成为Google工程师？先回答这15个面试问题【这只是一必要条件】（五）

6.从一大块文本中去掉重复的行关于这道题的想法有四个：第一，读入一个字符串与前面的每一个字符串比较，如果存在，删除，不存在，打印。时间复杂度O（n*n*K)，K为比较字符串的时间。代码如下：//the program used to trick the text which have duplicate line#include #include #include #
复制链接

扫一扫