// A web crawler I wrote myself; hoping to discuss and exchange ideas with everyone
package Crawl;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class Indepent {
    public static void main(String[] args) {
        Set<String> visitedurl = new HashSet<String>();             // pages already crawled
        LinkedList<String> unvisitedurl = new LinkedList<String>(); // frontier of pages still to crawl
        String seed = "http://www.xyz.abc";
        int i = 1;
        unvisitedurl.add(seed);
        // Breadth-first crawl: stop when the frontier is empty rather than
        // looping on visiturl != null, which never becomes false and lets
        // removeFirst() throw on an empty list.
        while (!unvisitedurl.isEmpty()) {
            String visiturl = unvisitedurl.removeFirst();
            if (visitedurl.contains(visiturl))
                continue;                                           // already seen, skip it
            catchallseed(visiturl, i);                              // download the page to disk
            Set<String> link = extractMain("HTMLDOWNLOAD\\" + i + ".html"); // harvest its links
            i++;
            unvisitedurl.addAll(link);                              // enqueue the newly found links
            visitedurl.add(visiturl);
            System.out.println(visiturl);
        }
    }
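    /**
     * Reads a previously downloaded HTML file and returns the set of
     * absolute http/https URLs found in it. Extraction is a plain
     * regular-expression scan over the file contents.
     */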
    public static Set<String> extractMain(String filename) {
        Set<String> links = new HashSet<String>();
        File file = new File(filename);
        Reader reader = null;                                       // reader for the downloaded page
        try {
            reader = new InputStreamReader(new FileInputStream(file));
            StringBuffer input = new StringBuffer();
            int ch;
            while ((ch = reader.read()) != -1)
                input.append((char) ch);
            // Match absolute http/https URLs. The original class
            // [^[A-Za-z0-9...]] negated the URL character set, so it matched
            // everything *except* URL characters; the corrected class below
            // matches the URL characters themselves.
            String patternString = "(http|https)://[A-Za-z0-9._?%&+\\-=/#]*";
            Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(input);
            while (matcher.find()) {
                links.add(matcher.group());                         // collect each matched URL
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            try {
                if (reader != null)
                    reader.close();                                 // always release the file handle
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return links;
    }
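    // A minimal sketch (not part of the original crawler) showing the corrected
    // URL pattern against an inline snippet; the method name and sample markup
    // below are illustrative only.
    public static void regexSmokeTest() {
        String sample = "<a href=\"http://www.xyz.abc/page1.html\">one</a>"
                + " <a href=\"https://www.xyz.abc/page2.html?id=3\">two</a>";
        Pattern pattern = Pattern.compile("(http|https)://[A-Za-z0-9._?%&+\\-=/#]*",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(sample);
        while (matcher.find())
            System.out.println(matcher.group());                    // prints the two sample URLs
    }

    /**
     * Downloads the page at the given URL with Commons HttpClient 3.x and
     * saves the raw response bytes under the HTMLDOWNLOAD directory as
     * number.html.
     */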
    public static void catchallseed(String url, int number) {
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);                   // prepare the GET request
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());               // install the default retry strategy
        FileOutputStream fos = null;
        try {
            int statusCode = httpClient.executeMethod(getMethod);   // execute the request
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
                return;                                             // do not save pages that failed to load
            }
            byte[] responseBody = getMethod.getResponseBody();      // read the page body
            new File("HTMLDOWNLOAD").mkdirs();                      // make sure the target directory exists
            fos = new FileOutputStream("HTMLDOWNLOAD\\" + number + ".html");
            fos.write(responseBody);                                // save the page to disk
        }
        catch (HttpException e) {
            System.out.println("Please check the HTTP address you provided!");
            e.printStackTrace();                                    // print the failure cause
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            if (fos != null) {
                try {
                    fos.close();                                    // release the file handle
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            getMethod.releaseConnection();                          // release the HTTP connection
        }
    }
}
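/*
 * Usage sketch (assumptions: Commons HttpClient 3.x plus its commons-codec and
 * commons-logging dependencies are on the classpath; the jar versions below
 * are illustrative):
 *
 *   javac -cp commons-httpclient-3.1.jar Crawl/Indepent.java
 *   java  -cp .;commons-httpclient-3.1.jar;commons-codec-1.3.jar;commons-logging-1.1.jar Crawl.Indepent
 *
 * The seed URL in main() is a placeholder; replace it with a real page before
 * running.
 */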