import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a single web page and the absolute {@code http://} resources it
 * references through {@code src="..."} attributes, saving everything into one
 * flat local directory.
 *
 * <p>Original author's stated intent: only links inside the target domain
 * (second/third level) are crawled, while image resources may be fetched from
 * third-party sites. Link ({@code href}) crawling is not implemented in this
 * class; only {@code src} resources are processed.
 *
 * <p>An instance of this class is also used as a plain record produced by the
 * HTML parser: it either describes one resource ({@code urlName} and
 * {@code urlFileName}) or carries the rewritten page ({@code html}).
 *
 * Created:2010-6-22
 * @author:Gerry
 * @version:
 */
public class WebProcess {

    /** Matches {@code src='http://...'} / {@code src="http://..."}; group 1 is the URL without its scheme. */
    private static final Pattern SRC_URL_PATTERN =
            Pattern.compile("src=['\"]http://?([^'\"<>]+)['\"]?", Pattern.CASE_INSENSITIVE);

    /** Three-character file-name endings (compared case-insensitively) of the resources that are saved. */
    private static final String[] SAVED_SUFFIXES = {".JS", "CSS", "SWF", "GIF", "JPG", "PNG"};

    /** Length of the {@code src=} attribute name plus its opening quote. */
    private static final int SRC_PREFIX_LENGTH = 5;

    private String urlName;     // original resource URL, without the "http://" scheme
    private String urlFileName; // local file name derived from urlName
    private String html;        // rewritten HTML content

    public String getUrlFileName() {
        return urlFileName;
    }

    public void setUrlFileName(String urlFileName) {
        this.urlFileName = urlFileName;
    }

    public String getHtml() {
        return html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getUrlName() {
        return urlName;
    }

    public void setUrlName(String urlName) {
        this.urlName = urlName;
    }

    /**
     * Fetches {@code http://<str_url>} and returns the raw response body.
     *
     * <p>NOTE: this sets the JVM-wide {@code http.proxyHost} /
     * {@code http.proxyPort} system properties on every call, so all HTTP
     * traffic of the process goes through that proxy afterwards.
     *
     * @param str_url host and path without the {@code http://} scheme
     * @return the response body bytes, or {@code null} if no connection object could be obtained
     * @throws Exception if the URL is malformed or the request fails
     */
    public ByteArrayOutputStream getAddressContext(String str_url) throws Exception {
        URL url = new URL("http://" + str_url);
        System.setProperty("http.proxyHost", "openproxy.xxxxxx.com");//setting proxy host
        System.setProperty("http.proxyPort", "8080");//setting proxy host port
        URLConnection conn = url.openConnection();
        if (conn == null) {
            return null;
        }
        conn.setRequestProperty("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0");
        conn.setUseCaches(false);

        ByteArrayOutputStream body = new ByteArrayOutputStream();
        InputStream ins = conn.getInputStream();
        try {
            byte[] buffer = new byte[1024];
            int read;
            while ((read = ins.read(buffer)) != -1) {
                body.write(buffer, 0, read);
            }
        } finally {
            // Release the connection's stream even when reading fails midway.
            ins.close();
        }
        return body;
    }

    /**
     * Ensures that the directory {@code path} exists, creating missing parents as needed.
     *
     * @param path directory to create
     * @return 1 if the directory exists after the call, 0 otherwise
     */
    private int writeFolder(String path) {
        File dir = new File(path);
        if (dir.isDirectory() || dir.mkdirs()) {
            return 1;
        }
        return 0;
    }

    /**
     * Writes a file, replacing any existing content.
     * When {@code content} is {@code null} the bytes of {@code outputstream}
     * are written (binary file); otherwise {@code content} is written as
     * UTF-8 text and {@code outputstream} is ignored.
     *
     * @param path         target file path
     * @param content      text to write, or {@code null} to write {@code outputstream}
     * @param outputstream raw bytes to write when {@code content} is {@code null}
     * @return always 1
     * @throws IOException if the file cannot be opened or written
     */
    private int writeFile(String path, String content, ByteArrayOutputStream outputstream) throws IOException {
        byte[] data = (content == null) ? outputstream.toByteArray() : content.getBytes("utf-8");
        FileOutputStream fout = new FileOutputStream(new File(path));
        try {
            fout.write(data);
            fout.flush();
        } finally {
            // Close the file handle even when the write fails.
            fout.close();
        }
        return 1;
    }

    /**
     * Rewrites absolute {@code src} resource URLs in {@code html} to flat local
     * file names and records each rewritten resource (the original site's
     * directory structure is not preserved).
     *
     * <p>Renaming rule: the {@code http://} scheme is dropped and every
     * {@code /} becomes {@code -}, e.g.
     * {@code http://hi.csdn.net/js/jquery-1.4.2.min.js} becomes
     * {@code hi.csdn.net-js-jquery-1.4.2.min.js}. The same derived name is used
     * both in the rewritten HTML and as the resource's {@code urlFileName}.
     *
     * <p>A matched URL that contains no {@code /} after the scheme is left
     * untouched and is not recorded.
     *
     * <p>For type {@code "1"}, one entry per rewritten resource is appended to
     * {@code list} in document order, followed by one final entry that carries
     * only the rewritten HTML.
     *
     * @param type {@code "1"} (or {@code null}) processes {@code src} resources;
     *             {@code "2"} was reserved by the original author for
     *             {@code href} links and is not implemented; any other value
     *             is a no-op
     * @param html page content to scan
     * @param list list the results are appended to
     * @return {@code list}
     * @throws Exception declared for compatibility with existing callers
     * Created:2010-6-22
     * @author:Gerry
     */
    private List<WebProcess> parseHtmlContent(String type, String html, List<WebProcess> list) throws Exception {
        if (type == null) {
            type = "1";
        }
        if (!type.equals("1")) {
            return list;
        }

        Matcher m = SRC_URL_PATTERN.matcher(html);
        StringBuffer rewritten = new StringBuffer(html.length());
        while (m.find()) {
            String url = m.group(1);
            if (url.indexOf('/') < 0) {
                // Nothing to flatten. Skipping leaves the text in place: it is
                // copied verbatim by the next appendReplacement/appendTail.
                continue;
            }
            String fileName = url.replace('/', '-');

            WebProcess resource = new WebProcess();
            resource.setUrlName(url);          // original URL (without scheme)
            resource.setUrlFileName(fileName); // local file name
            list.add(resource);

            // Keep the attribute name, quotes and original casing; swap only the URL.
            String prefix = html.substring(m.start(), m.start() + SRC_PREFIX_LENGTH);
            String suffix = html.substring(m.end(1), m.end());
            // quoteReplacement: '$' and '\' in a URL must not be read as group references.
            m.appendReplacement(rewritten, Matcher.quoteReplacement(prefix + fileName + suffix));
        }
        m.appendTail(rewritten);

        WebProcess page = new WebProcess();
        page.setHtml(rewritten.toString());
        list.add(page);
        return list;
    }

    /**
     * Core workflow: downloads the page at {@code webname}, rewrites its
     * resource links, saves it as {@code index.html} and then downloads the
     * referenced resources next to it.
     *
     * <p>The page is decoded as UTF-8. The output directory is hard-coded to
     * {@code F:\<webname>}.
     *
     * @param webname host (and optional path) without the {@code http://} scheme
     * @return always 1
     * @throws Exception if the page cannot be downloaded, or the output
     *                   directory or {@code index.html} cannot be written
     * Created:2010-6-22
     * @author:Gerry
     */
    private int kernelBusiness(String webname) throws Exception {
        System.out.println("正在读取远程文件:"+webname+"...");
        String content = getAddressContext(webname).toString("UTF-8");
        System.out.println("远程文件读取完毕!");

        List<WebProcess> list = parseHtmlContent("1", content, new ArrayList<WebProcess>());
        if (!list.isEmpty()) {
            // The last entry carries the rewritten page.
            content = list.get(list.size() - 1).getHtml();
        }
        System.out.println("一共是:"+list.size());

        // Write the main page to disk.
        System.out.println("正在写入文件index.html到本地...");
        String localAddress = "F:\\"+webname ;// change the disk location here
        if (writeFolder(localAddress) != 1) {
            throw new IOException("Cannot create output directory: " + localAddress);
        }
        writeFile(localAddress+"\\index.html", content, null);
        System.out.println("文件index.html保存完毕!");

        // Write the resources referenced by the main page.
        for (WebProcess resource : list) {
            if (resource.getUrlName() == null) {
                continue; // the HTML-only entry, not a downloadable resource
            }
            saveResource(localAddress, resource);
        }
        return 1;
    }

    /**
     * Downloads one resource and stores its bytes unchanged under
     * {@code localAddress}. Resources whose file name does not end in a
     * supported suffix are skipped without being downloaded. Failures are
     * reported on the console and do not abort the run.
     */
    private void saveResource(String localAddress, WebProcess resource) {
        String resourceUrl = resource.getUrlName();
        String fileName = resource.getUrlFileName();
        if (!isSupportedResource(fileName)) {
            return;
        }

        System.out.println("正在读取远程文件:"+resourceUrl+"...");
        ByteArrayOutputStream body = null;
        try {
            body = getAddressContext(resourceUrl);
        } catch (Exception ex) {
            System.err.println(ex);
        }
        if (body == null) {
            System.out.println("远程连接出现异常!!!请求http://"+resourceUrl+"失败!");
            return; // nothing to write; do not leave an empty file behind
        }
        System.out.println("远程文件读取完毕!");

        System.out.println("正在写入文件"+resourceUrl+"到本地...");
        try {
            // Raw bytes are written for text resources too, so files that are
            // not UTF-8 encoded are stored without corruption.
            writeFile(localAddress+"\\"+fileName, null, body);
        } catch (IOException ex) {
            System.out.println("文件写入出现异常!!!文件"+fileName+"失败!");
            System.err.println(ex);
            return;
        }
        System.out.println("文件"+resourceUrl+"保存完毕!");
    }

    /**
     * Tells whether {@code fileName} is longer than three characters and its
     * last three characters match one of {@link #SAVED_SUFFIXES}, ignoring case.
     */
    private static boolean isSupportedResource(String fileName) {
        if (fileName == null || fileName.length() <= 3) {
            return false;
        }
        int tail = fileName.length() - 3;
        for (String suffix : SAVED_SUFFIXES) {
            // regionMatches(ignoreCase) compares per character and does not depend on the default locale.
            if (fileName.regionMatches(true, tail, suffix, 0, 3)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Entry point. Mirrors the site given as the first argument, or
     * {@code www.csdn.net} when no argument is supplied, and prints the
     * elapsed time in milliseconds.
     */
    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();
        System.out.println("开始执行时间:=="+startTime);
        String site = (args != null && args.length > 0) ? args[0] : "www.csdn.net";
        WebProcess process = new WebProcess();
        process.kernelBusiness(site);
        long endTime = System.currentTimeMillis();
        long elapsed = endTime - startTime;
        System.out.println("结束执行时间:=="+endTime);
        System.out.println("一共执行了时间:=="+elapsed);//6172 ms
    }
}