大致意思就是先给定一个URL,然后用HttpParser开源工具提取其网页上的链接,并丢到队列里,然后取队列的首URL重复此操作。
需要用到HttpClient和HttpParser包。
这里先给出所有.Java文件的源码。
其中还有些问题,今天不想看了,所有问题明天再说。
今天做了些修改:改用 Queue 来管理待抓取的 URL;另外考虑到提取出的 URL 可能不正确,加了以 http 开头的判断。目前仍然不能保存到文件,但应该没有其他问题了。
-------------------------------------------------------------------------------------MyCrawler.java---------------------------------------------------------------------------------
package MyCrawler;
import java.util.Set;
/**
*
* @author xkey
*/
public class MyCrawler {

    /**
     * Seeds the crawl by adding every seed URL to the un-visited queue.
     *
     * @param seeds initial seed URLs
     */
    private static void initCrawlerWithSeeds(String[] seeds) {
        for (String seed : seeds) {
            LinkQueue.addUnVisitedUrl(seed);
        }
    }

    /**
     * Crawl loop: repeatedly dequeues an un-visited URL, downloads the page,
     * marks the URL visited, extracts its out-links and enqueues them.
     * Stops when the queue is empty or 1000 pages have been visited.
     *
     * @param seeds initial seed URLs
     */
    public static void crawling(String[] seeds) {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unVisitedUrlIsEmpty() && LinkQueue.getVisitedUrlNum() < 1000) {
            // Head of the queue; poll() may return null if the queue drained.
            String visitUrl = (String) LinkQueue.unVisitedUrlDequeue();
            // Check for null BEFORE using the value (the original printed it first).
            if (visitUrl == null) continue;
            System.out.println(visitUrl);
            DownLoaderFile downLoader = new DownLoaderFile();
            downLoader.downLoaderFile(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            // Extract out-links from the page just downloaded.
            Set<String> links = HtmlParserTool.extracLinks(visitUrl);
            // Enqueue newly discovered URLs (LinkQueue deduplicates).
            for (String link : links) {
                System.out.println("xkey: " + link);
                LinkQueue.addUnVisitedUrl(link);
            }
        }
    }

    public static void main(String[] args) {
        // crawling is static — no instance needed.
        MyCrawler.crawling(new String[]{"http://www.baidu.com"});
    }
}
-------------------------------------------------------------------------------------LinkQueue.java------------------------------------------------------------------------------------
public class LinkQueue {

    /**
     * URLs already visited. Wrapped in a synchronized set so this class is
     * consistently thread-safe (the un-visited queue is already concurrent).
     */
    private static Set<String> visitedUrl = Collections.synchronizedSet(new HashSet<String>());

    /** URLs waiting to be visited, in FIFO order. */
    private static Queue<String> unVisitedUrl = new ConcurrentLinkedQueue<String>();

    /** Returns the queue of not-yet-visited URLs. */
    public static Queue<String> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    /** Records a URL as visited. */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /** Removes a URL from the visited set. */
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    /**
     * Dequeues the next un-visited URL.
     *
     * @return the head of the queue, or null if the queue is empty
     */
    public static Object unVisitedUrlDequeue() {
        return unVisitedUrl.poll();
    }

    /**
     * Enqueues a URL for visiting while guaranteeing each URL is processed
     * at most once: blank, already-visited and already-queued URLs are ignored.
     */
    public static void addUnVisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unVisitedUrl.contains(url)) {
            unVisitedUrl.add(url);
        }
    }

    /** Number of URLs visited so far. */
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    /** True if no URLs are waiting to be visited. */
    public static boolean unVisitedUrlIsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
-------------------------------------------------------------------------------------DownLoaderFile.java---------------------------------------------------------------------------------
public class DownLoaderFile {

    /**
     * Derives a local file name from a URL and the response Content-Type.
     * Characters illegal in file names are replaced with '_'; the extension
     * is ".html" for HTML content, otherwise the Content-Type subtype.
     *
     * @param url         the page URL
     * @param contentType the HTTP Content-Type header value
     * @return a file name safe for the local file system
     */
    public String getFileNameByUrl(String url, String contentType) {
        // Strip the scheme prefix. A blind substring(7) only worked for
        // "http://" and corrupted any other scheme (e.g. "https://").
        url = url.replaceFirst("^\\w+://", "");
        // text/html content
        if (contentType.indexOf("html") != -1) {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        }
        // other types (e.g. application/pdf): use the subtype as extension
        return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    /**
     * Saves a byte array to a local file; filePath is the target path.
     * Creates missing parent directories first — a FileOutputStream fails
     * when the directory does not exist, which is the likely reason saving
     * "still did not work" before.
     */
    private void saveToLocal(byte[] data, String filePath) {
        DataOutputStream out = null;
        try {
            File file = new File(filePath);
            File parent = file.getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs(); // ensure the target directory exists
            }
            out = new DataOutputStream(new FileOutputStream(file));
            out.write(data); // single bulk write instead of byte-by-byte
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the file handle, even on failure.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Downloads the page at the given URL and saves it locally.
     *
     * @param url the page URL
     * @return the path of the saved file, or null on failure
     */
    public String downLoaderFile(String url) {
        String filePath = null;
        // 1. create the HttpClient and set the connection timeout (5 s)
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
        // 2. create the GET method: 5 s socket timeout, default retry handler
        GetMethod getMethod = new GetMethod(url);
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // 3. execute the HTTP GET request
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                // Don't read the body of a failed response; finally still
                // releases the connection.
                return null;
            }
            // 4. read the response body and save it under a derived file name
            byte[] responseBody = getMethod.getResponseBody();
            filePath = "D:\\xkey\\"
                    + getFileNameByUrl(url, getMethod.getResponseHeader("Content-Type").getValue());
            saveToLocal(responseBody, filePath);
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            getMethod.releaseConnection();
        }
        return filePath;
    }
}
-------------------------------------------------------------------------------------HtmlParserTool.java--------------------------------------------------------------------------------------------------------------
public class HtmlParserTool {

    /**
     * Extracts out-links from the page at the given URL: the href of every
     * &lt;a&gt; tag and the src of every &lt;frame&gt; tag. Only links that
     * start with "http" are returned.
     *
     * @param url the page URL to parse
     * @return the set of extracted URLs (empty on parse failure)
     */
    public static Set<String> extracLinks(String url) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");
            // Matches <frame src=...> nodes.
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) { // <a> tag
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    // startsWith, not contains: contains("http") would also
                    // accept e.g. javascript: URLs that mention "http".
                    if (linkUrl.startsWith("http")) {
                        links.add(linkUrl);
                    }
                } else { // <frame> tag, e.g. <frame src="test.html">
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    if (start == -1) continue; // malformed frame tag: skip
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if (end == -1) end = frame.indexOf(">");
                    if (end == -1) continue; // no terminator found: skip
                    // Skip past src=" (5 chars) and drop the trailing quote.
                    String frameUrl = frame.substring(5, end - 1);
                    if (frameUrl.startsWith("http")) {
                        links.add(frameUrl);
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}