6.从一大块文本中去掉重复的行
关于这道题的想法有四个:
第一,读入一个字符串与前面的每一个字符串比较,如果存在,删除,不存在,打印。时间复杂度O(n*n*K),K为比较字符串的时间。代码如下:
// Program that strips duplicate lines from a text file (brute-force pairwise comparison).
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
using namespace std;
// Input stream for "input.txt" (path relative to the working directory).
// Being a global, it is constructed - and the file opened - before main runs.
ifstream fin("input.txt");
// Output stream for "output.txt"; the de-duplicated text is written here.
ofstream fout("output.txt");
// Lines of the input text, appended by readText() in the order they are read.
vector<string> lines;
// Load the whole input into memory: every line that getline() can extract
// from `fin` is appended to the global `lines` vector, in reading order.
void readText()
{
    for (std::string current; std::getline(fin, current); )
    {
        lines.push_back(current);
    }
}
// Write every distinct entry of `lines` to `fout`, keeping the order of first
// occurrence. A line is emitted only when none of the lines before it compares
// equal, so the cost is O(n^2) string comparisons.
//
// Empty input is handled naturally: the outer loop simply never runs, and the
// first line is always emitted because it has no predecessors to match.
void removeduplicate()
{
    for (std::size_t i = 0; i < lines.size(); ++i)
    {
        // Bind by reference: copying each line for every comparison is wasted work.
        const std::string& candidate = lines[i];

        bool seenBefore = false;
        for (std::size_t j = 0; j < i; ++j)
        {
            if (lines[j] == candidate)
            {
                seenBefore = true;
                break;
            }
        }

        if (!seenBefore)
        {
            // '\n' instead of endl: no need to flush the file after every line.
            fout << candidate << '\n';
        }
    }
}
int main()
{
readText();
removeduplicate();
return 0;
}
第二种想法:读入数据,对数据行进行排序,可以使用快排,然后从第二行开始遍历,与前一行比较,如果相同,则跳过,不同则打印。算法时间复杂度为O(n*log(n)*K)。注意:排序后输出的行顺序与原文不同。代码如下:
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
// Input stream for "input.txt" (path relative to the working directory).
// Being a global, it is constructed - and the file opened - before main runs.
ifstream fin("input.txt");
// Output stream for "output.txt"; the de-duplicated text is written here.
ofstream fout("output.txt");
// Lines of the input text. readText() appends them; removeduplicate() sorts
// this vector in place.
vector<string> lines;
// Pull lines out of `fin` until extraction fails, storing each one at the
// back of the global `lines` vector.
void readText()
{
    std::string row;
    for (;;)
    {
        if (!std::getline(fin, row))
        {
            break;
        }
        lines.push_back(row);
    }
}
// Sort `lines` in place so equal lines become neighbours, then write each line
// that differs from its predecessor to `fout`. Output is therefore in sorted
// order, not in the original file order. Cost is dominated by the sort:
// O(n log n) string comparisons.
//
// Empty input is safe: nothing is indexed unless the vector has elements.
void removeduplicate()
{
    std::sort(lines.begin(), lines.end());

    for (std::size_t i = 0; i < lines.size(); ++i)
    {
        // The first line has no predecessor and is always kept.
        if (i == 0 || lines[i] != lines[i - 1])
        {
            // '\n' instead of endl: no need to flush the file after every line.
            fout << lines[i] << '\n';
        }
    }
}
int main()
{
readText();
removeduplicate();
return 0;
}
第三种想法:采用Trie树(字典树)的想法,读入数据,构建一棵Trie树,利用Trie树进行排重,然后再把Trie树打印出来。时间复杂度O(n * K),K为字符串的长度。代码如下:
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
// Input stream for "input.txt" (path relative to the working directory).
// Being a global, it is constructed - and the file opened - before main runs.
ifstream fin("input.txt");
// Output stream for "output.txt"; the de-duplicated text is written here.
ofstream fout("output.txt");
// One node of the trie. Each node stands for a single byte of a line; the path
// from the root down to a node spells out a prefix of some input line.
struct Character
{
    char c;                 // the byte this node represents
    bool end;               // true when some input line ends exactly at this node
    Character *next[256];   // children, indexed by the value of the next byte

    // Start every node empty (no byte, not an end marker, no children) however
    // it is created, so a node is never read with indeterminate members.
    Character() : c('\0'), end(false), next() {}
};
// Root of the trie. It represents the empty prefix: generateTriTree() starts
// every insertion here and hangs the first byte of each line off its next[].
Character Tree;
// Heap buffer of 100 chars, released with delete[] at the end of main.
// NOTE(review): the capacity is fixed, so anything written through this
// pointer has to stay within 100 bytes.
char * line = new char[100];
//read the line and generate the Tri Tree
void generateTriTree()
{
string str;
while (getline(fin,str))
{
Character * tmp = &Tree;
for (int i=0; i<str.size(); i++)
{
if (tmp->next[str[i]] == NULL)
{
tmp->next[str[i]] = new Character();
tmp->next[str[i]]->c = str[i];
}
tmp = tmp->next[str[i]];
}
tmp->end = true;
}//while
}
// Depth-first walk of the subtree rooted at `tree`, writing to `fout` every
// line that ends at a node of that subtree.
//
//   tree  - node to visit; a null pointer is ignored.
//   index - depth of `tree`, i.e. the zero-based position of its byte within
//           the line being rebuilt. Must be non-negative; top-level callers
//           pass 0 for a child of the root.
//
// Children are visited in ascending byte value, so output is in byte-wise
// sorted order. The line text is assembled in a growable string, so lines of
// any length are handled.
void printTriTree(Character* tree, int index)
{
    if (tree == NULL)
    {
        return;
    }

    // Shared across the recursion: positions [0, index) still hold the bytes
    // of this node's ancestors; anything beyond is left over from a sibling
    // subtree and is dropped before this node's byte is appended.
    static std::string path;
    path.resize(static_cast<std::string::size_type>(index));
    path.push_back(tree->c);

    // Some input line ends exactly here: emit the prefix built so far.
    if (tree->end)
    {
        fout << path << '\n';
    }

    for (int i = 0; i < 256; ++i)
    {
        printTriTree(tree->next[i], index + 1);
    }
}
// Free the subtree rooted at `tree`: all 256 child slots are released first
// (post-order), then the node itself is deleted. A null pointer is a no-op.
void release(Character* tree)
{
    if (tree != NULL)
    {
        Character** child = tree->next;
        Character** const stop = tree->next + 256;
        while (child != stop)
        {
            release(*child);
            ++child;
        }
        delete tree;
    }
}
// Entry point: build the trie from input.txt, write its distinct lines to
// output.txt, then free the trie and the global buffer. Returns 0 on success,
// 1 when either file could not be opened.
int main()
{
    int status = 0;

    if (!fin.is_open() || !fout.is_open())
    {
        // Fail loudly rather than silently producing an empty result.
        std::cerr << "cannot open input.txt or output.txt" << std::endl;
        status = 1;
    }
    else
    {
        generateTriTree();

        // An empty input line ends at the root itself, which the per-child
        // walk below never visits; emit it here so blank lines are kept once
        // instead of being dropped.
        if (Tree.end)
        {
            fout << '\n';
        }

        for (int i = 0; i < 256; ++i)
        {
            printTriTree(Tree.next[i], 0);
        }

        for (int i = 0; i < 256; ++i)
        {
            release(Tree.next[i]);
            Tree.next[i] = NULL;   // do not leave a dangling pointer in the root
        }
    }

    fin.close();
    fout.close();
    delete [] line;
    return status;
}
第四种方法:采用Hash/set进行排重,读入一个数据,放入hash/set中,如果不存在则打印出来,存在则不进行打印。使用Hash时时间复杂度为O(n*K),K为Hash过程损耗的时间。下面的代码使用的是std::set(有序集合),每次插入需要O(log(n))次字符串比较,总时间复杂度为O(n*log(n)*K),并且由print函数在全部读入之后按字典序输出。代码如下:
#include <iostream>
#include <fstream>
#include <set>
#include <string>
using namespace std;
// Distinct lines of the input. readText() inserts every line it reads; the set
// keeps one copy of each and iterates in sorted order.
set<string> lineset;
// Input stream for "input.txt" (path relative to the working directory).
// Being a global, it is constructed - and the file opened - before main runs.
ifstream fin("input.txt");
// Output stream for "output.txt"; print() writes the distinct lines here.
ofstream fout("output.txt");
// Feed every line of `fin` into the global `lineset`. Inserting a line that is
// already present leaves the set unchanged, which is what removes duplicates.
void readText()
{
    for (std::string entry; std::getline(fin, entry); )
    {
        lineset.insert(entry);
    }
}
// Write the contents of `lineset` to `fout`, one entry per line, in the set's
// iteration order (ascending).
void print()
{
    std::set<std::string>::iterator cursor = lineset.begin();
    while (cursor != lineset.end())
    {
        fout << *cursor << std::endl;
        ++cursor;
    }
}
int main()
{
readText();
return 0;
}