网上有很多类似“百度狗” 的元搜索引擎,他们抓取百度和google的搜索结果,然后合并在一个网页里。
且不说这样做有什么意义,但是这的确是一件好玩的事,自己想了一下原理,写了这样一个类,可以抓取百度里的搜索结果。
原理很简单:
- 通过搜索关键字和页码进入百度,例如“http://www.baidu.com/s?wd=搜索关键字&pn=页码”(这里的页码计算公式是:10*(页码-1))
- 找到真正有用的那一行,把信息读出来,
- 按照每一个项的特点把他们切开,例如URL前面都有“class=f”……
- 然后存起来
下面是代码:
package
org.qisou;
import java.io. * ;
import java.net. * ;
/**/ /*
* 这是一个获取百度的类,它可以用一个搜索关键字来初始化,然后将抓取的结果存到SearchItem的对象数组里
*/
public class CatchBaidu ... {
/** *//**
* @param args
* strKey: 搜索关键字
*/
private String strKey;
private String[] info;
private String source;
private SearchItem[] ItemList;
/**//*
* 构造函数,使用搜索关键字构造
*/
public CatchBaidu(String strKey,int pages)...{
this.strKey=strKey;
pages=(pages-1)*10;
this.ItemList = new SearchItem[10];
this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+pages;
this.source=Socket(this.source);
info = new String[40];
this.GetBaidu();
}
public CatchBaidu(String strKey)...{
this.strKey=strKey;
this.ItemList = new SearchItem[10];
this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+1;
this.source=Socket(this.source);
info = new String[40];
this.GetBaidu();
}
/**//*
* 使用socket获取制定baidu页面,并对页面进行初级筛选,找出有用的一行返回
*/
public String Socket(String strPage)...{
String strServer="www.baidu.com";
String s=null;
try ...{
String hostname = strServer;
int port = 80;
InetAddress addr = InetAddress.getByName(hostname);
Socket socket = new Socket(addr, port); //建立一个Socket
//发送命令
BufferedWriter wr = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream(), "UTF8"));
wr.write("GET " + strPage + " HTTP/1.0 ");
wr.write("HOST:" + strServer + " ");
wr.write(" ");
wr.flush();
//接收返回的结果
BufferedReader rd = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String line;
while (true) ...{
line = rd.readLine();
if(line.length()>=75)...{
if(line.startsWith("<table border="0" cellpadding="0" cellspacing="0"><tr><td class=f><a href=""))...{
s=line;
break;
}
}
}
wr.close();
rd.close();
}
catch (Exception e) ...{
System.out.println(e.toString());
}
return s;
}//获取百度有用的字段;
public String ClearOnce(String s)...{
int first;
int last;
first = s.indexOf("<");
last = s.indexOf(">");
StringBuilder builder = new StringBuilder(s);
builder.delete(first,last+1);
return builder.toString();
}//清除一次<>
public String Clear(String s)...{
while(s.indexOf(">")>0&s.indexOf(">")<s.length())...{
s=this.ClearOnce(s);
}
return s;
}//清除所有的<>
public String ClearK(String s)...{
return s.substring(0,s.indexOf("- "));
}//去掉百度快照
public String GetUrl(String s)...{
int first;
int lest;
if((first=s.indexOf("class=f"))!=-1)...{
lest=s.indexOf("" target");
s=s.substring(first+17,lest);
return s;
}else
return "";
}//获取URL
public void GetBaidu()...{
int br;
for(int i=0;i<40;i++)...{
br=this.source.indexOf("<br>");
info[i]=this.source.substring(0,br);
this.source=this.source.substring(br+4,this.source.length());
}//将数据进行分段,每四段代表一个信息
for(int i=0;i<40;i++)...{
if((i+1)%4==1)...{
info[i+3]=GetUrl(info[i]);
}
info[i]=Clear(info[i]);
if((i+1)%4==3)...{
info[i]=ClearK(info[i]);
}
//System.out.println(st[i]);
}//将信息整理,从新存储,顺序为 标题、简介、页面信息、URL
for(int i=0;i<10;i++)...{
int j=i*4;
SearchItem item = new SearchItem(info[j],info[j+3],info[j+1],info[j+2]);
this.ItemList[i]=item;
}//将每条信息存入对象数组
}//把信息分段存入数组
public SearchItem[] GetBaiduItemList()...{
return ItemList;
}
public static void main(String[] args)...{
//System.out.print("QiSou.cn Search: ");
//KeyboardInput input = new KeyboardInput();
//String search = input.readString();
CatchBaidu obj = new CatchBaidu("forest",4);
SearchItem[] itemlist = obj.GetBaiduItemList();
for(int i=0;i<10;i++)...{
itemlist[i].print();
}
}
}
import java.io. * ;
import java.net. * ;
/**/ /*
* 这是一个获取百度的类,它可以用一个搜索关键字来初始化,然后将抓取的结果存到SearchItem的对象数组里
*/
public class CatchBaidu ... {
/** *//**
* @param args
* strKey: 搜索关键字
*/
private String strKey;
private String[] info;
private String source;
private SearchItem[] ItemList;
/**//*
* 构造函数,使用搜索关键字构造
*/
public CatchBaidu(String strKey,int pages)...{
this.strKey=strKey;
pages=(pages-1)*10;
this.ItemList = new SearchItem[10];
this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+pages;
this.source=Socket(this.source);
info = new String[40];
this.GetBaidu();
}
public CatchBaidu(String strKey)...{
this.strKey=strKey;
this.ItemList = new SearchItem[10];
this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+1;
this.source=Socket(this.source);
info = new String[40];
this.GetBaidu();
}
/**//*
* 使用socket获取制定baidu页面,并对页面进行初级筛选,找出有用的一行返回
*/
public String Socket(String strPage)...{
String strServer="www.baidu.com";
String s=null;
try ...{
String hostname = strServer;
int port = 80;
InetAddress addr = InetAddress.getByName(hostname);
Socket socket = new Socket(addr, port); //建立一个Socket
//发送命令
BufferedWriter wr = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream(), "UTF8"));
wr.write("GET " + strPage + " HTTP/1.0 ");
wr.write("HOST:" + strServer + " ");
wr.write(" ");
wr.flush();
//接收返回的结果
BufferedReader rd = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String line;
while (true) ...{
line = rd.readLine();
if(line.length()>=75)...{
if(line.startsWith("<table border="0" cellpadding="0" cellspacing="0"><tr><td class=f><a href=""))...{
s=line;
break;
}
}
}
wr.close();
rd.close();
}
catch (Exception e) ...{
System.out.println(e.toString());
}
return s;
}//获取百度有用的字段;
public String ClearOnce(String s)...{
int first;
int last;
first = s.indexOf("<");
last = s.indexOf(">");
StringBuilder builder = new StringBuilder(s);
builder.delete(first,last+1);
return builder.toString();
}//清除一次<>
public String Clear(String s)...{
while(s.indexOf(">")>0&s.indexOf(">")<s.length())...{
s=this.ClearOnce(s);
}
return s;
}//清除所有的<>
public String ClearK(String s)...{
return s.substring(0,s.indexOf("- "));
}//去掉百度快照
public String GetUrl(String s)...{
int first;
int lest;
if((first=s.indexOf("class=f"))!=-1)...{
lest=s.indexOf("" target");
s=s.substring(first+17,lest);
return s;
}else
return "";
}//获取URL
public void GetBaidu()...{
int br;
for(int i=0;i<40;i++)...{
br=this.source.indexOf("<br>");
info[i]=this.source.substring(0,br);
this.source=this.source.substring(br+4,this.source.length());
}//将数据进行分段,每四段代表一个信息
for(int i=0;i<40;i++)...{
if((i+1)%4==1)...{
info[i+3]=GetUrl(info[i]);
}
info[i]=Clear(info[i]);
if((i+1)%4==3)...{
info[i]=ClearK(info[i]);
}
//System.out.println(st[i]);
}//将信息整理,从新存储,顺序为 标题、简介、页面信息、URL
for(int i=0;i<10;i++)...{
int j=i*4;
SearchItem item = new SearchItem(info[j],info[j+3],info[j+1],info[j+2]);
this.ItemList[i]=item;
}//将每条信息存入对象数组
}//把信息分段存入数组
public SearchItem[] GetBaiduItemList()...{
return ItemList;
}
public static void main(String[] args)...{
//System.out.print("QiSou.cn Search: ");
//KeyboardInput input = new KeyboardInput();
//String search = input.readString();
CatchBaidu obj = new CatchBaidu("forest",4);
SearchItem[] itemlist = obj.GetBaiduItemList();
for(int i=0;i<10;i++)...{
itemlist[i].print();
}
}
}
.
package
org.qisou;
/**
 * One search result: title, link address, synopsis and page information.
 */
public class SearchItem {

    /** Result title. */
    private String title;
    /** Link address of the result. */
    private String URL;
    /** Short description of the result. */
    private String synopsis;
    /** Page information line of the result. */
    private String info;

    /** Creates an item whose four fields are all null. */
    public SearchItem() {
    }

    /**
     * Creates an item from its four fields.
     *
     * @param title    result title
     * @param URL      link address
     * @param synopsis short description
     * @param info     page information
     */
    public SearchItem(String title, String URL, String synopsis, String info) {
        this.title = title;
        this.URL = URL;
        this.synopsis = synopsis;
        this.info = info;
    }

    /** @return the result title */
    public String GetTitle() {
        return title;
    }

    /** @return the link address */
    public String GetURL() {
        return URL;
    }

    /** @return the short description */
    public String GetSynopsis() {
        return synopsis;
    }

    /** @return the page information */
    public String GetInfo() {
        return info;
    }

    /** Prints URL, title, synopsis and page information, one per line, to standard output. */
    public void print() {
        System.out.println(this.GetURL());
        System.out.println(this.GetTitle());
        System.out.println(this.GetSynopsis());
        System.out.println(this.GetInfo());
    }

    /** Demo: builds a sample item and prints it. */
    public static void main(String[] args) {
        SearchItem obj = new SearchItem("title", "URL", "synopsis", "info");
        obj.print();
    }
}
/**
 * One search result: title, link address, synopsis and page information.
 */
public class SearchItem {

    /** Result title. */
    private String title;
    /** Link address of the result. */
    private String URL;
    /** Short description of the result. */
    private String synopsis;
    /** Page information line of the result. */
    private String info;

    /** Creates an item whose four fields are all null. */
    public SearchItem() {
    }

    /**
     * Creates an item from its four fields.
     *
     * @param title    result title
     * @param URL      link address
     * @param synopsis short description
     * @param info     page information
     */
    public SearchItem(String title, String URL, String synopsis, String info) {
        this.title = title;
        this.URL = URL;
        this.synopsis = synopsis;
        this.info = info;
    }

    /** @return the result title */
    public String GetTitle() {
        return title;
    }

    /** @return the link address */
    public String GetURL() {
        return URL;
    }

    /** @return the short description */
    public String GetSynopsis() {
        return synopsis;
    }

    /** @return the page information */
    public String GetInfo() {
        return info;
    }

    /** Prints URL, title, synopsis and page information, one per line, to standard output. */
    public void print() {
        System.out.println(this.GetURL());
        System.out.println(this.GetTitle());
        System.out.println(this.GetSynopsis());
        System.out.println(this.GetInfo());
    }

    /** Demo: builds a sample item and prints it. */
    public static void main(String[] args) {
        SearchItem obj = new SearchItem("title", "URL", "synopsis", "info");
        obj.print();
    }
}