使用 IE 将网页另存为 htm 文件时,CSS 样式文件中的图片路径不正确;本工具会修正这些路径,并下载对应的图片。
最近经常要做 HTML Demo,所以写了这个工具类。
package com.chruan.html.ie;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import com.chruan.util.DirUtil;
/**
 * Repairs a web page that was saved from Internet Explorer with "Save as" (an
 * .htm/.html file plus a companion "_files" folder).
 *
 * <p>
 * In such a save the image paths inside the saved CSS files are wrong. This
 * tool re-reads the live page to learn where each stylesheet came from,
 * downloads every image referenced through <code>url(...)</code> into an
 * "images" folder next to the saved CSS file, and rewrites the reference to
 * <code>images/&lt;file name&gt;</code>.
 *
 * <p>
 * Downloads are best effort: a failed download is reported on standard error
 * and processing continues. CSS files are read and written with the platform
 * default charset. Images that share a file name overwrite each other.
 *
 * @version 1.0.0
 * @author chruan
 *
 */
public class IeHtmlTool {

    /**
     * Address of the live page that was saved. It is used exactly as given;
     * URL paths are case-sensitive, so it must not be lower-cased.
     */
    private final String pageUrl;

    /**
     * Path of the saved .htm/.html file.
     */
    private final String htmlFilePath;

    /**
     * Path of the companion "_files" folder, or null when htmlFilePath does
     * not end with .htm or .html.
     */
    private String files = null;

    /** Scheme and host of the page, e.g. "http://host". */
    private String domainPage = null;

    /** Directory part of the page URL, "/" for the site root. */
    private String pathPage = "/";

    /** Scheme and host of the stylesheet currently being processed. */
    private String domainCss;

    /** Directory part of the URL of the stylesheet currently being processed. */
    private String pathCss;

    /** Absolute URLs of the link tags found on the live page, or null. */
    private String[] cssLinks = null;

    /**
     * Creates a tool for one saved page. No I/O happens here.
     *
     * @param pageUrl
     *            address the page was saved from; must start with http:// or
     *            https:// (any letter case)
     * @param htmlFilePath
     *            path of the saved .htm/.html file
     * @throws NullPointerException
     *             if either argument is null
     * @throws RuntimeException
     *             if pageUrl does not start with http:// or https://
     */
    public IeHtmlTool(String pageUrl, String htmlFilePath) {
        super();
        if (pageUrl == null) {
            throw new NullPointerException("pageUrl");
        }
        if (htmlFilePath == null) {
            throw new NullPointerException("htmlFilePath");
        }
        this.pageUrl = pageUrl;
        this.htmlFilePath = htmlFilePath;
        parsePageUrl(pageUrl);
        initHtmlFiles();
    }

    /**
     * Downloads the live page and collects the href of every link tag as an
     * absolute URL into cssLinks. Format: &lt;link href="url"/&gt;.
     *
     * <p>
     * On a download failure the problem is reported and cssLinks is left
     * untouched, so image URLs are later resolved against the page itself.
     */
    private void spiderSylteLink() {
        String html;
        InputStream in = null;
        try {
            in = new URL(pageUrl).openStream();
            // Decode once, after all bytes are read, so that a multi-byte
            // character is never split across two read buffers.
            html = new String(readFully(in));
        } catch (IOException e) {
            System.err.println("cannot read page " + pageUrl + ": " + e);
            return;
        } finally {
            closeQuietly(in);
        }

        List<String> links = new ArrayList<String>();
        int cursor = 0;
        while ((cursor = html.indexOf("<link", cursor)) > -1) {
            // Only look for href inside this tag; always move past the tag
            // so the scan is guaranteed to terminate.
            int tagEnd = html.indexOf('>', cursor);
            if (tagEnd < 0) {
                tagEnd = html.length();
            }
            String href = extractHref(html.substring(cursor, tagEnd));
            cursor = tagEnd;
            String link = resolveLink(domainPage, pathPage, href);
            if (link == null) {
                continue;
            }
            links.add(link);
            System.out.println("css link: " + link);
        }
        System.out.println("total link: " + links.size());
        if (!links.isEmpty()) {
            cssLinks = links.toArray(new String[links.size()]);
        }
    }

    /**
     * Processes every .css file below the "_files" folder: downloads the
     * images it references and rewrites the references to the local copies.
     *
     * @throws IllegalStateException
     *             if the saved file path does not end with .htm or .html, so
     *             the "_files" folder cannot be derived
     */
    public void prefyCss() {
        if (files == null) {
            throw new IllegalStateException(
                    "expected a path ending with .htm or .html: "
                            + htmlFilePath);
        }
        spiderSylteLink();
        List<String> cssFiles = new ArrayList<String>();
        collectFiles(new File(files), ".css", cssFiles);
        for (String css : cssFiles) {
            downCssImg(css);
        }
    }

    /**
     * Adds the absolute path of every regular file below root whose path
     * ends with suffix. Directories that cannot be listed are skipped.
     */
    private static void collectFiles(File root, String suffix,
            List<String> found) {
        if (root.isDirectory()) {
            File[] children = root.listFiles();
            if (children == null) {
                return; // unreadable directory or I/O error
            }
            for (File child : children) {
                collectFiles(child, suffix, found);
            }
        } else {
            String path = root.getAbsolutePath();
            if (path.endsWith(suffix)) {
                found.add(path);
            }
        }
    }

    /**
     * Rewrites one saved CSS file in place and downloads its images. The
     * file is only overwritten after it has been read completely.
     *
     * @param css
     *            absolute path of the saved CSS file
     */
    private void downCssImg(String css) {
        File file = new File(css);
        parseCssLink(file.getName());
        String imageDir = file.getParent() + "/images/";

        ByteArrayOutputStream rewritten = new ByteArrayOutputStream();
        PrintWriter writer = new PrintWriter(rewritten);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(rewriteCssLine(line, imageDir));
                writer.write("\n");
            }
        } catch (IOException e) {
            System.err.println("cannot read " + css + ": " + e);
            return;
        } finally {
            closeQuietly(reader);
            writer.close();
        }

        OutputStream out = null;
        try {
            out = new FileOutputStream(css);
            out.write(rewritten.toByteArray());
        } catch (IOException e) {
            System.err.println("cannot write " + css + ": " + e);
        } finally {
            closeQuietly(out);
        }
    }

    /**
     * Rewrites every url(...) on one CSS line to images/&lt;file name&gt; and
     * downloads the referenced file. References that cannot be fetched over
     * http(s) (data: URIs, "#..." fragments, empty values) are left as is.
     *
     * @param line
     *            one line of CSS; minified files may carry many url(...)
     * @param imageDir
     *            local folder, with trailing slash, that receives the images
     * @return the line with the rewritten references
     */
    private String rewriteCssLine(String line, String imageDir) {
        StringBuilder out = new StringBuilder();
        int cursor = 0;
        while (true) {
            int open = line.indexOf("url(", cursor);
            if (open < 0) {
                break;
            }
            int start = open + 4;
            int end = line.indexOf(')', start);
            if (end < 0) {
                break;
            }
            String raw = line.substring(start, end);
            String ref = stripQuotes(raw.trim());
            String replacement = raw;
            String link = resolveLink(domainCss, pathCss, ref);
            String name = parseImgName(ref);
            if (link != null && name.length() > 0) {
                replacement = "images/" + name;
                dowmImg(link, imageDir + name);
            }
            out.append(line, cursor, start).append(replacement);
            cursor = end;
        }
        out.append(line, cursor, line.length());
        return out.toString();
    }

    /**
     * Downloads one URL to a local file, creating parent folders as needed.
     * Best effort: failures are reported on standard error and otherwise
     * ignored so that one broken image does not stop the whole run.
     *
     * @param host
     *            absolute URL to download
     * @param saveTo
     *            local file path to write
     */
    private void dowmImg(String host, String saveTo) {
        InputStream in = null;
        OutputStream out = null;
        try {
            in = new URL(host).openStream();
            File parent = new File(saveTo).getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
            // Opened only after the connection succeeded, so an existing
            // local copy is not truncated by a failed download.
            out = new FileOutputStream(saveTo);
            byte[] chunk = new byte[2048];
            int n;
            while ((n = in.read(chunk)) != -1) {
                out.write(chunk, 0, n);
            }
        } catch (IOException e) {
            System.err.println("cannot download " + host + ": " + e);
        } catch (RuntimeException e) {
            // URLs come from arbitrary CSS; keep the batch going.
            System.err.println("cannot download " + host + ": " + e);
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /**
     * Derives the "_files" folder from the saved file path by replacing a
     * trailing .htm or .html (any letter case) with "_files". Leaves files
     * null for any other suffix.
     */
    private void initHtmlFiles() {
        if (files != null) {
            return;
        }
        String[] suffixes = { ".htm", ".html" };
        for (String suffix : suffixes) {
            int cut = htmlFilePath.length() - suffix.length();
            if (cut >= 0 && htmlFilePath.regionMatches(true, cut, suffix, 0,
                    suffix.length())) {
                files = htmlFilePath.substring(0, cut) + "_files";
                return;
            }
        }
    }

    /**
     * Splits the page URL and uses it as the initial base for CSS images.
     */
    private void parsePageUrl(String url) {
        String[] arr = parseUrl(url);
        domainPage = arr[0];
        pathPage = arr[1];
        domainCss = domainPage;
        pathCss = pathPage;
    }

    /**
     * Splits an absolute http(s) URL into [domain, path], where domain is
     * scheme plus host ("http://host") and path is the directory of the
     * resource without a trailing slash ("/a/b"), or "/" for the root. A
     * query string or fragment is ignored.
     *
     * @param url
     *            absolute URL
     * @return two-element array: domain, path
     * @throws RuntimeException
     *             if url does not start with http:// or https://
     */
    private String[] parseUrl(String url) {
        int hostStart;
        if (startsWithIgnoreCase(url, "http://")) {
            hostStart = "http://".length();
        } else if (startsWithIgnoreCase(url, "https://")) {
            hostStart = "https://".length();
        } else {
            throw new RuntimeException(
                    "expected start with http:// or https://");
        }
        // A '/' inside the query or fragment must not be taken for a path
        // separator.
        int cut = queryOrFragmentStart(url);
        String base = cut < 0 ? url : url.substring(0, cut);

        int pathStart = base.indexOf('/', hostStart);
        if (pathStart == -1) {
            return new String[] { base, "/" };
        }
        String domain = base.substring(0, pathStart);
        int lastSlash = base.lastIndexOf('/');
        String path = lastSlash > pathStart
                ? base.substring(pathStart, lastSlash)
                : "/";
        return new String[] { domain, path };
    }

    /**
     * Selects the base URL for images of one saved CSS file by looking for
     * the file name among the links found on the live page. An exact name
     * match wins over a match on the name without its "css" extension. Falls
     * back to the page URL when nothing matches.
     *
     * @param name
     *            css file name
     */
    private void parseCssLink(String name) {
        if (cssLinks == null) {
            return;
        }
        String key = "/" + name;
        String key2 = "/" + name.substring(0, Math.max(0, name.length() - 3));
        int find = -1;
        int find2 = -1;
        for (int idx = 0; idx < cssLinks.length; idx++) {
            if (cssLinks[idx].indexOf(key) != -1) {
                find = idx;
            } else if (cssLinks[idx].indexOf(key2) != -1) {
                find2 = idx;
            }
        }
        if (find < 0) {
            find = find2;
        }
        if (find > -1) {
            String[] arr = parseUrl(cssLinks[find]);
            domainCss = arr[0];
            pathCss = arr[1];
        } else {
            domainCss = domainPage;
            pathCss = pathPage;
        }
    }

    /**
     * Returns the file name of an image reference: everything after the last
     * '/', without query string or fragment (both would make an invalid or
     * misleading local file name).
     *
     * @param url
     *            image reference as written in the CSS, already unquoted
     * @return the bare file name, possibly empty
     */
    private String parseImgName(String url) {
        int cut = queryOrFragmentStart(url);
        if (cut >= 0) {
            url = url.substring(0, cut);
        }
        int pos = url.lastIndexOf('/');
        return pos >= 0 ? url.substring(pos + 1) : url;
    }

    /**
     * Resolves a relative link against a directory path, removing "." and
     * ".." segments.
     *
     * <p>
     * Only whole path segments are interpreted, so names such as "a..b.png"
     * are kept. A ".." that would climb above the root is dropped, which is
     * what browsers do.
     *
     * @param path
     *            directory path, e.g. "/a/b" or "/"
     * @param link
     *            relative link, e.g. "../img/x.png"
     * @return absolute path starting with "/", e.g. "/a/img/x.png"
     */
    public String prefyUrl(String path, String link) {
        List<String> segments = new ArrayList<String>();
        applySegments(segments, path);
        applySegments(segments, link);
        StringBuilder resolved = new StringBuilder();
        for (String segment : segments) {
            resolved.append('/').append(segment);
        }
        if (resolved.length() == 0 || link.length() == 0
                || link.endsWith("/")) {
            resolved.append('/');
        }
        return resolved.toString();
    }

    /**
     * Applies the '/'-separated segments of part to the segment stack:
     * ".." pops, "." and empty segments are ignored, anything else pushes.
     */
    private static void applySegments(List<String> segments, String part) {
        int from = 0;
        while (from <= part.length()) {
            int slash = part.indexOf('/', from);
            int to = slash == -1 ? part.length() : slash;
            String segment = part.substring(from, to);
            if (segment.equals("..")) {
                if (!segments.isEmpty()) {
                    segments.remove(segments.size() - 1);
                }
            } else if (segment.length() > 0 && !segment.equals(".")) {
                segments.add(segment);
            }
            from = to + 1;
        }
    }

    /**
     * Turns a reference found in HTML or CSS into an absolute http(s) URL.
     *
     * @param domain
     *            scheme and host of the referring document
     * @param dirPath
     *            directory path of the referring document
     * @param ref
     *            reference as written, already trimmed and unquoted
     * @return the absolute URL, or null when the reference is empty, a pure
     *         fragment, or uses a scheme other than http/https (e.g. data:)
     */
    private String resolveLink(String domain, String dirPath, String ref) {
        if (ref == null || ref.length() == 0 || ref.charAt(0) == '#') {
            return null;
        }
        if (startsWithIgnoreCase(ref, "http://")
                || startsWithIgnoreCase(ref, "https://")) {
            return ref;
        }
        if (hasScheme(ref)) {
            return null;
        }
        if (ref.startsWith("//")) {
            // Protocol-relative: reuse the scheme of the referring document.
            return domain.substring(0, domain.indexOf(':') + 1) + ref;
        }
        if (ref.startsWith("/")) {
            return domain + ref;
        }
        // Keep the query/fragment out of dot-segment handling.
        int cut = queryOrFragmentStart(ref);
        String pathPart = cut < 0 ? ref : ref.substring(0, cut);
        String suffix = cut < 0 ? "" : ref.substring(cut);
        return domain + prefyUrl(dirPath, pathPart) + suffix;
    }

    /**
     * Returns the value of the first quoted href attribute in a tag, or null
     * when there is none. Both double and single quotes are accepted.
     */
    private static String extractHref(String tag) {
        int at = tag.indexOf("href=");
        if (at < 0) {
            return null;
        }
        int quoteAt = at + "href=".length();
        if (quoteAt >= tag.length()) {
            return null;
        }
        char quote = tag.charAt(quoteAt);
        if (quote != '"' && quote != '\'') {
            return null;
        }
        int close = tag.indexOf(quote, quoteAt + 1);
        if (close < 0) {
            return null;
        }
        return tag.substring(quoteAt + 1, close).trim();
    }

    /** Removes one pair of matching single or double quotes, if present. */
    private static String stripQuotes(String value) {
        int last = value.length() - 1;
        if (last >= 1) {
            char first = value.charAt(0);
            if ((first == '"' || first == '\'')
                    && value.charAt(last) == first) {
                return value.substring(1, last).trim();
            }
        }
        return value;
    }

    /** Tells whether ref starts with a URI scheme such as "data:". */
    private static boolean hasScheme(String ref) {
        int colon = ref.indexOf(':');
        if (colon <= 0) {
            return false;
        }
        for (int i = 0; i < colon; i++) {
            char c = ref.charAt(i);
            boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            boolean other = (c >= '0' && c <= '9') || c == '+' || c == '-'
                    || c == '.';
            if (!letter && !(i > 0 && other)) {
                return false;
            }
        }
        return true;
    }

    /** Index of the first '?' or '#' in ref, or -1 when there is neither. */
    private static int queryOrFragmentStart(String ref) {
        int query = ref.indexOf('?');
        int fragment = ref.indexOf('#');
        if (query == -1) {
            return fragment;
        }
        if (fragment == -1) {
            return query;
        }
        return Math.min(query, fragment);
    }

    /** Case-insensitive variant of String.startsWith. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Reads a stream to its end and returns all bytes. */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[2048];
        int n;
        while ((n = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, n);
        }
        return buffer.toByteArray();
    }

    /** Closes a resource, ignoring null and close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close here.
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>
     * Usage: <code>IeHtmlTool [pageUrl htmlFilePath]</code>. Without
     * arguments the built-in sample values are used. To go through an HTTP
     * proxy, set the <code>http.proxyHost</code> and
     * <code>http.proxyPort</code> system properties.
     *
     * @param args
     *            optional page URL and saved file path
     */
    public static void main(String[] args) {
        String pageUrl = "http://buy.ccb.com/";
        // Path the page was saved to.
        String htmlFilePath = "C:/Users/Administrator/Desktop/tmp/善融商务个人商城-建设银行旗下B2C个人购物商城平台,支持信用卡分期和担保交易,品质保证。.htm";
        if (args != null && args.length >= 2) {
            pageUrl = args[0];
            htmlFilePath = args[1];
        }
        IeHtmlTool tool = new IeHtmlTool(pageUrl, htmlFilePath);
        tool.prefyCss();
    }
}
package com.chruan.util;
import java.io.File;
import java.util.List;
public class DirUtil {
public static void dir(List list, File file, String type) {
if (file.isDirectory()) {
File[] fs = file.listFiles();
for (int idx = 0; idx < fs.length; idx++) {
dir(list, fs[idx], type);
}
} else {
String p = file.getAbsolutePath();
if (type != null && p.endsWith(type))
list.add(p);
}
}
}