/*这里我的网络爬虫大体实现了网络爬虫的基本功能,采取的是宽度优先搜索,宽度优先搜索的好处是可以较快的访问到重要网页,因为重要网页往往离种子网页较近,且利于多爬虫合作。基本的原理是将网页先保存下来,将该Url放入已访问链接,再提取网页中的超链接,放入未访问队列。内容或多或少,肯定有不足之处,恳请大家不吝赐教。*/
/*MyClawler.java*/
import java.util.Set;
public class MyClawler {
    /** Seed every start URL into the unvisited queue. */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (String seed : seeds) {
            LinkQueue.addUnvisitedUrl(seed);
        }
    }

    /**
     * Breadth-first crawl starting from the given seeds: download each page,
     * mark its URL visited, then enqueue every out-link that has not been
     * seen yet. Runs until the frontier queue is empty.
     */
    public void crawling(String[] seeds) {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unvisitedUrlsEmpty()) {
            // Next URL in FIFO order; skip any null entry defensively.
            String visitUrl = (String) LinkQueue.unvisitedUrlDeQueue();
            if (visitUrl == null) {
                continue;
            }
            // Persist the page to disk, then record the URL as visited.
            new DownLoadFile().downloadFile(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            // Schedule every link discovered on the page.
            for (String link : HtmlParserTool.extractLinks(visitUrl)) {
                LinkQueue.addUnvisitedUrl(link);
            }
        }
    }

    public static void main(String[] args) {
        new MyClawler().crawling(new String[] {"http://www.baidu.com"});
        System.out.println("done");
    }
}
/*
*DownLoadFile.java
*/
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.SequenceInputStream;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class DownLoadFile {
    // Charset tokens recognised when sniffing the page head for an encoding.
    // Compiled once instead of on every download.
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("gb2312|GB2312|GBK|gbk|utf-8|UTF-8|utf8|UTF8");

    /**
     * Derive a local file name from a URL and its MIME content type,
     * replacing characters that are illegal in file names with '_'.
     *
     * @param url         the page URL
     * @param contentType the Content-Type header value, e.g. "text/html"
     * @return a sanitized file name with an extension taken from the type
     */
    public String getFileNameByUrl(String url, String contentType) {
        // Strip the scheme rather than assuming a fixed 7-character "http://"
        // prefix; the original substring(7) silently corrupted https:// URLs.
        int schemeEnd = url.indexOf("://");
        if (schemeEnd != -1) {
            url = url.substring(schemeEnd + 3);
        }
        String safe = url.replaceAll("[\\?/:|<>\"]", "_");
        if (contentType.indexOf("html") != -1) {
            return safe + ".html"; // any text/html flavour maps to .html
        }
        // Otherwise use the MIME subtype as the extension (image/png -> .png).
        return safe + "." + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    /**
     * Download the page at {@code url} and save it under E:\temp.
     *
     * @param url the page to fetch
     * @return the path the page was saved to, or null on any failure
     */
    public String downloadFile(String url) {
        String filePath = null;
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        try {
            CloseableHttpResponse response = httpclient.execute(httpGet);
            try {
                if (response.getStatusLine().getStatusCode() == 200) {
                    InputStream in = response.getEntity().getContent();
                    // Sniff the charset from the first bytes of the body.
                    byte[] head = new byte[1000];
                    int read = in.read(head, 0, head.length);
                    if (read < 0) {
                        read = 0; // empty body
                    }
                    String headText = new String(head, 0, read);
                    Matcher m = CHARSET_PATTERN.matcher(headText);
                    String charset = m.find() ? m.group() : "utf-8";
                    // Re-attach the sniffed head bytes in front of the rest of
                    // the stream; the original discarded them, so every saved
                    // page was missing its first chunk.
                    InputStream whole = new SequenceInputStream(
                            new ByteArrayInputStream(head, 0, read), in);
                    BufferedReader responseBody =
                            new BufferedReader(new InputStreamReader(whole, charset));
                    // Guard against a missing Content-Type header (the original
                    // would NPE here and fall into the broad catch).
                    Header contentTypeHeader = response.getFirstHeader("Content-Type");
                    String contentType =
                            contentTypeHeader != null ? contentTypeHeader.getValue() : "text/html";
                    filePath = "E:\\temp\\" + getFileNameByUrl(url, contentType);
                    saveToLocal(responseBody, filePath, charset);
                } else {
                    System.err.print("Method Failed:" + response.getStatusLine().getStatusCode());
                }
            } finally {
                response.close(); // release the connection even on failure
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            httpclient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return filePath;
    }

    /**
     * Read the whole response body, echo it to stdout, and write it to
     * {@code filePath} using {@code charset}. The original used FileWriter,
     * which writes in the platform default encoding and could corrupt
     * non-ASCII pages that were decoded with the sniffed charset.
     *
     * @throws IOException on any read or write failure
     */
    private void saveToLocal(BufferedReader responseBody, String filePath, String charset)
            throws IOException {
        StringBuffer buffer = new StringBuffer();
        String line;
        while ((line = responseBody.readLine()) != null) {
            buffer.append(line);
        }
        String page = buffer.toString();
        System.out.println(page); // echo the page source
        Writer out = new OutputStreamWriter(new FileOutputStream(filePath), charset);
        try {
            out.write(page);
        } finally {
            out.close();
        }
    }
}
/*
*HtmlParserTool.java
*/
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class HtmlParserTool {
    /**
     * Parse the page at {@code url} and return the set of out-links found in
     * &lt;a&gt; tags and in the src attribute of other (e.g. frame) tags.
     * Returns an empty set if parsing fails.
     *
     * @param url the page to parse
     * @return the set of extracted link URLs (never null)
     */
    public static Set<String> extractLinks(String url) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            // NOTE(review): encoding is hard-coded to gb2312; pages served in
            // other charsets may parse incorrectly — confirm against targets.
            parser.setEncoding("gb2312");
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { linkFilter });
            NodeList nodeList = parser.parse(lastFilter);
            Node[] nodes = nodeList.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof LinkTag) {
                    // <a href="..."> tag: the parser resolves the link for us.
                    links.add(((LinkTag) nodes[i]).getLink());
                } else {
                    // e.g. <frame src="test.html"/>: pull the src attribute by hand.
                    String frame = nodes[i].getText();
                    int start = frame.indexOf("src=\"");
                    if (start == -1) {
                        continue; // no src attribute; nothing to extract
                    }
                    // Search for the terminator FROM the attribute position.
                    // The original searched from index 0, which could land
                    // before "src" and crash substring() (swallowed silently).
                    int end = frame.indexOf(" ", start);
                    if (end == -1) {
                        end = frame.indexOf(">", start);
                    }
                    if (end == -1) {
                        end = frame.length();
                    }
                    // start + 5 skips `src="`; end - 1 drops the closing quote.
                    if (end - 1 > start + 5) {
                        links.add(frame.substring(start + 5, end - 1));
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}
/*
*Queue.java
*/
import java.util.LinkedList;

/**
 * Minimal FIFO queue backed by a LinkedList. Generified so callers get
 * type-safe access; existing raw-type callers still compile.
 *
 * @param <T> element type
 */
public class Queue<T> {
    private final LinkedList<T> queue = new LinkedList<T>();

    /** Append {@code t} to the tail of the queue. */
    public void enQueue(T t) {
        queue.add(t);
    }

    /**
     * Remove and return the head of the queue.
     * Throws NoSuchElementException if the queue is empty.
     */
    public T deQueue() {
        return queue.removeFirst();
    }

    /** @return true when the queue holds no elements */
    public boolean isQueueEmpty() {
        return queue.isEmpty();
    }

    /** @return true when {@code t} is already queued (O(n) list scan) */
    public boolean contains(Object t) {
        return queue.contains(t);
    }
}
/*
*LinkQueue.java
*/
import java.util.HashSet;
import java.util.Set;
public class LinkQueue {
    // URLs that have already been downloaded (typed Set instead of raw type).
    private static Set<String> visitedUrl = new HashSet<String>();
    // FIFO frontier of URLs still to be crawled.
    private static Queue unvisitedUrl = new Queue();

    /** Remove and return the next URL to crawl, in FIFO order. */
    public static Object unvisitedUrlDeQueue() {
        return unvisitedUrl.deQueue();
    }

    /** Record {@code url} as downloaded. */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /**
     * Enqueue {@code url} unless it is null/blank, already visited, or
     * already waiting in the frontier.
     * NOTE(review): Queue.contains is an O(n) list scan, so this is
     * quadratic over a large crawl — a mirror HashSet would fix it.
     */
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unvisitedUrl.contains(url)) {
            unvisitedUrl.enQueue(url);
        }
    }

    /** @return true when there is nothing left to crawl */
    public static boolean unvisitedUrlsEmpty() {
        return unvisitedUrl.isQueueEmpty();
    }

    /** @return the number of URLs downloaded so far */
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }
}
// Reference: 《自己动手写网络爬虫》 ("Write Your Own Web Crawler")