/*这里我的网络爬虫大体实现了网络爬虫的基本功能,采取的是宽度优先搜索,宽度优先搜索的好处是可以较快的访问到重要网页,因为重要网页往往离种子网页较近,且利于多爬虫合作。基本的原理是将网页先保存下来,将该Url放入已访问链接,再提取网页中的超链接,放入未访问队列。内容或多或少,肯定有不足之处,恳请大家不吝赐教。*/
/*MyClawler.java*/
import java.util.Set;
public class MyClawler {
    /** Seed every start URL into the unvisited queue. */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (String seed : seeds) {
            LinkQueue.addUnvisitedUrl(seed);
        }
    }

    /**
     * Breadth-first crawl starting from the given seeds: download each page,
     * mark its URL visited, then enqueue every out-link that has not been
     * seen yet. Runs until the frontier queue is empty.
     */
    public void crawling(String[] seeds) {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unvisitedUrlsEmpty()) {
            // Next URL in FIFO order; skip any null entry defensively.
            String visitUrl = (String) LinkQueue.unvisitedUrlDeQueue();
            if (visitUrl == null) {
                continue;
            }
            // Persist the page to disk, then record the URL as visited.
            new DownLoadFile().downloadFile(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            // Schedule every link discovered on the page.
            for (String link : HtmlParserTool.extractLinks(visitUrl)) {
                LinkQueue.addUnvisitedUrl(link);
            }
        }
    }

    public static void main(String[] args) {
        new MyClawler().crawling(new String[] {"http://www.baidu.com"});
        System.out.println("done");
    }
}
/*
*DownLoadFile.java
*/
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.SequenceInputStream;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class DownLoadFile {
    // Charset tokens recognised when sniffing the page head for an encoding.
    // Compiled once instead of on every download.
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("gb2312|GB2312|GBK|gbk|utf-8|UTF-8|utf8|UTF8");

    /**
     * Derive a local file name from a URL and its MIME content type,
     * replacing characters that are illegal in file names with '_'.
     *
     * @param url         the page URL
     * @param contentType the Content-Type header value, e.g. "text/html"
     * @return a sanitized file name with an extension taken from the type
     */
    public String getFileNameByUrl(String url, String contentType) {
        // Strip the scheme rather than assuming a fixed 7-character "http://"
        // prefix; the original substring(7) silently corrupted https:// URLs.
        int schemeEnd = url.indexOf("://");
        if (schemeEnd != -1) {
            url = url.substring(schemeEnd + 3);
        }
        String safe = url.replaceAll("[\\?/:|<>\"]", "_");
        if (contentType.indexOf("html") != -1) {
            return safe + ".html"; // any text/html flavour maps to .html
        }
        // Otherwise use the MIME subtype as the extension (image/png -> .png).
        return safe + "." + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    /**
     * Download the page at {@code url} and save it under E:\temp.
     *
     * @param url the page to fetch
     * @return the path the page was saved to, or null on any failure
     */
    public String downloadFile(String url) {
        String filePath = null;
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        try {
            CloseableHttpResponse response = httpclient.execute(httpGet);
            try {
                if (response.getStatusLine().getStatusCode() == 200) {
                    InputStream in = response.getEntity().getContent();
                    // Sniff the charset from the first bytes of the body.
                    byte[] head = new byte[1000];
                    int read = in.read(head, 0, head.length);
                    if (read < 0) {
                        read = 0; // empty body
                    }
                    String headText = new String(head, 0, read);
                    Matcher m = CHARSET_PATTERN.matcher(headText);
                    String charset = m.find() ? m.group() : "utf-8";
                    // Re-attach the sniffed head bytes in front of the rest of
                    // the stream; the original discarded them, so every saved
                    // page was missing its first chunk.
                    InputStream whole = new SequenceInputStream(
                            new ByteArrayInputStream(head, 0, read), in);
                    BufferedReader responseBody =
                            new BufferedReader(new InputStreamReader(whole, charset));
                    // Guard against a missing Content-Type header (the original
                    // would NPE here and fall into the broad catch).
                    Header contentTypeHeader = response.getFirstHeader("Content-Type");
                    String contentType =
                            contentTypeHeader != null ? contentTypeHeader.getValue() : "text/html";
                    filePath = "E:\\temp\\" + getFileNameByUrl(url, contentType);
                    saveToLocal(responseBody, filePath, charset);
                } else {
                    System.err.print("Method Failed:" + response.getStatusLine().getStatusCode());
                }
            } finally {
                response.close(); // release the connection even on failure
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            httpclient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return filePath;
    }

    /**
     * Read the whole response body, echo it to stdout, and write it to
     * {@code filePath} using {@code charset}. The original used FileWriter,
     * which writes in the platform default encoding and could corrupt
     * non-ASCII pages that were decoded with the sniffed charset.
     *
     * @throws IOException on any read or write failure
     */
    private void saveToLocal(BufferedReader responseBody, String filePath, String charset)
            throws IOException {
        StringBuffer buffer = new StringBuffer();
        String line;
        while ((line = responseBody.readLine()) != null) {
            buffer.append(line);
        }
        String page = buffer.toString();
        System.out.println(page); // echo the page source
        Writer out = new OutputStreamWriter(new FileOutputStream(filePath), charset);
        try {
            out.write(page);
        } finally {
            out.close();
        }
    }
}
/*
*HtmlParserTool.java
*/
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class HtmlParserTool {
    /**
     * Parse the page at {@code url} and return the set of out-links found in
     * &lt;a&gt; tags and in the src attribute of other (e.g. frame) tags.
     * Returns an empty set if parsing fails.
     *
     * @param url the page to parse
     * @return the set of extracted link URLs (never null)
     */
    public static Set<String> extractLinks(String url) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            // NOTE(review): encoding is hard-coded to gb2312; pages served in
            // other charsets may parse incorrectly — confirm against targets.
            parser.setEncoding("gb2312");
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { linkFilter });
            NodeList nodeList = parser.parse(lastFilter);
            Node[] nodes = nodeList.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof LinkTag) {
                    // <a href="..."> tag: the parser resolves the link for us.
                    links.add(((LinkTag) nodes[i]).getLink());
                } else {
                    // e.g. <frame src="test.html"/>: pull the src attribute by hand.
                    String frame = nodes[i].getText();
                    int start = frame.indexOf("src=\"");
                    if (start == -1) {
                        continue; // no src attribute; nothing to extract
                    }
                    // Search for the terminator FROM the attribute position.
                    // The original searched from index 0, which could land
                    // before "src" and crash substring() (swallowed silently).
                    int end = frame.indexOf(" ", start);
                    if (end == -1) {
                        end = frame.indexOf(">", start);
                    }
                    if (end == -1) {
                        end = frame.length();
                    }
                    // start + 5 skips `src="`; end - 1 drops the closing quote.
                    if (end - 1 > start + 5) {
                        links.add(frame.substring(start + 5, end - 1));
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}
/*
*Queue.java
*/
import java.util.LinkedList;

/**
 * Minimal FIFO queue backed by a LinkedList. Generified so callers get
 * type-safe access; existing raw-type callers still compile.
 *
 * @param <T> element type
 */
public class Queue<T> {
    private final LinkedList<T> queue = new LinkedList<T>();

    /** Append {@code t} to the tail of the queue. */
    public void enQueue(T t) {
        queue.add(t);
    }

    /**
     * Remove and return the head of the queue.
     * Throws NoSuchElementException if the queue is empty.
     */
    public T deQueue() {
        return queue.removeFirst();
    }

    /** @return true when the queue holds no elements */
    public boolean isQueueEmpty() {
        return queue.isEmpty();
    }

    /** @return true when {@code t} is already queued (O(n) list scan) */
    public boolean contains(Object t) {
        return queue.contains(t);
    }
}
/*
*LinkQueue.java
*/
import java.util.HashSet;
import java.util.Set;
public class LinkQueue {
    // URLs that have already been downloaded (typed Set instead of raw type).
    private static Set<String> visitedUrl = new HashSet<String>();
    // FIFO frontier of URLs still to be crawled.
    private static Queue unvisitedUrl = new Queue();

    /** Remove and return the next URL to crawl, in FIFO order. */
    public static Object unvisitedUrlDeQueue() {
        return unvisitedUrl.deQueue();
    }

    /** Record {@code url} as downloaded. */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /**
     * Enqueue {@code url} unless it is null/blank, already visited, or
     * already waiting in the frontier.
     * NOTE(review): Queue.contains is an O(n) list scan, so this is
     * quadratic over a large crawl — a mirror HashSet would fix it.
     */
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unvisitedUrl.contains(url)) {
            unvisitedUrl.enQueue(url);
        }
    }

    /** @return true when there is nothing left to crawl */
    public static boolean unvisitedUrlsEmpty() {
        return unvisitedUrl.isQueueEmpty();
    }

    /** @return the number of URLs downloaded so far */
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }
}
// Reference: 《自己动手写网络爬虫》 ("Write Your Own Web Crawler")