html Demo工具类：网页使用ie另存为htm文件时，css样式文件的图片路径不对，进行修改，并下载图片

本文链接：https://blog.csdn.net/chruan/article/details/8820513

使用 IE 将网页另存为 htm 文件时，保存下来的 css 样式文件中的图片路径不正确。本工具会修正这些图片路径，并把对应的图片下载到本地。

最近经常要做 html Demo，于是写了这个工具类。

package com.chruan.html.ie;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.chruan.util.DirUtil;

/**
 * Repairs the style sheets of a page that was saved with Internet Explorer's
 * "Save As": the image references inside the saved css files are rewritten to
 * a local <code>images/</code> folder and the images are downloaded into it.
 * 
 * @version 1.0.0
 * @author chruan
 * 
 */
public class IeHtmlTool {

	/**
	 * URL of the page that was saved.
	 */
	private String pageUrl;
	/**
	 * Path of the saved htm/html file.
	 */
	private String htmlFilePath = null;
	/**
	 * Path of the companion "_files" folder; stays null when htmlFilePath
	 * does not end with .htm or .html.
	 */
	private String files = null;

	/** Scheme and host of the page, e.g. "http://host". */
	private String domainPage = null;
	/** Directory part of the page URL ("/" for the site root). */
	private String pathPage = "/";
	/** Scheme and host used to resolve images of the css file in progress. */
	private String domainCss;
	/** Directory used to resolve images of the css file in progress. */
	private String pathCss;
	/** Absolute URLs of the link tags found in the page, or null. */
	private String[] cssLinks = null;

	/**
	 * 
	 * @param pageUrl
	 *            URL of the saved page; must start with http:// or https://
	 * @param htmlFilePath
	 *            path of the saved .htm/.html file
	 * @throws RuntimeException
	 *             when pageUrl does not start with http:// or https://
	 */
	public IeHtmlTool(String pageUrl, String htmlFilePath) {
		if (pageUrl == null) {
			throw new NullPointerException("pageUrl");
		}
		// The URL is kept exactly as given: lower-casing it would change
		// case-sensitive paths and fetch a different resource.
		this.pageUrl = pageUrl;
		parsePageUrl(pageUrl);
		if (htmlFilePath == null) {
			throw new NullPointerException("htmlFilePath");
		}
		this.htmlFilePath = htmlFilePath;
		initHtmlFiles();
	}

	/**
	 * get css links from pageUrl. format: <link href="url"/>
	 * 
	 * Best effort: when the page cannot be read the failure is reported on
	 * System.err and cssLinks is left unchanged.
	 */
	private void spiderSylteLink() {
		InputStream is = null;
		try {
			is = new URL(pageUrl).openStream();

			// Collect all bytes first and decode once, so a multi-byte
			// character can never be split across two read buffers.
			ByteArrayOutputStream raw = new ByteArrayOutputStream();
			byte[] b = new byte[2048];
			int n;
			while ((n = is.read(b)) != -1) {
				raw.write(b, 0, n);
			}
			String html = raw.toString();

			List list = new ArrayList();
			int pos = 0;
			while ((pos = html.indexOf("<link", pos)) > -1) {
				int tagEnd = html.indexOf(">", pos);
				if (tagEnd == -1) {
					tagEnd = html.length();
				}
				String tag = html.substring(pos, tagEnd);
				// Always continue behind the current tag so the scan
				// terminates even when the tag has no usable href.
				pos = tagEnd;

				int start = tag.indexOf("href=\"");
				if (start == -1) {
					continue;
				}
				start += "href=\"".length();
				int end = tag.indexOf("\"", start);
				if (end == -1) {
					continue;
				}
				String href = tag.substring(start, end).trim();
				if (href.length() == 0) {
					continue;
				}
				String link = resolveUrl(domainPage, pathPage, href);
				list.add(link);
				System.out.println("css link: " + link);
			}

			System.out.println("total link: " + list.size());

			if (list.size() > 0) {
				cssLinks = (String[]) list.toArray(new String[list.size()]);
			}
		} catch (Exception e) {
			System.err.println("cannot read css links from " + pageUrl + ": "
					+ e);
		} finally {
			closeQuietly(is);
		}
	}

	/**
	 * parse css file and get img url. down imgs from url. rewrite img url in
	 * css.
	 * 
	 * @throws IllegalStateException
	 *             when the html file path given to the constructor does not
	 *             end with .htm or .html, so no "_files" folder is known
	 */
	public void prefyCss() {
		if (files == null) {
			throw new IllegalStateException(
					"expected html file path ending with .htm or .html: "
							+ htmlFilePath);
		}
		spiderSylteLink();

		List list = new ArrayList();
		DirUtil.dir(list, new File(files), ".css");

		for (int idx = 0, size = list.size(); idx < size; idx++) {
			downCssImg((String) list.get(idx));
		}
	}

	/**
	 * Rewrites every url(...) reference of one css file to
	 * <code>images/&lt;name&gt;</code> and downloads the referenced files
	 * into the <code>images</code> folder next to the css file. The css file
	 * is only overwritten after it has been read completely.
	 * 
	 * @param css
	 *            path of the css file
	 */
	private void downCssImg(String css) {
		File file = new File(css);
		parseCssLink(file.getName());
		String imageDir = file.getParent() + "/images/";

		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(file));
			PrintWriter writer = new PrintWriter(baos);
			String line;
			while ((line = reader.readLine()) != null) {
				writer.write(rewriteCssLine(line, imageDir));
				writer.write("\n");
			}
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
			// Leave the css file untouched when it could not be processed.
			return;
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (Exception ignored) {
					// nothing useful can be done when close fails
				}
			}
		}

		OutputStream os = null;
		try {
			os = new FileOutputStream(css);
			os.write(baos.toByteArray());
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(os);
		}
	}

	/**
	 * Rewrites all url(...) references of one css line and downloads the
	 * referenced files. Empty references and data: URIs are left unchanged.
	 * 
	 * @param line
	 *            one line of the css file
	 * @param imageDir
	 *            folder (with trailing slash) receiving the downloads
	 * @return the line with its references pointing to images/
	 */
	private String rewriteCssLine(String line, String imageDir) {
		StringBuffer out = new StringBuffer();
		int copied = 0;
		int open;
		while ((open = line.indexOf("url(", copied)) > -1) {
			int start = open + "url(".length();
			int end = line.indexOf(")", start);
			if (end == -1) {
				break;
			}
			String img = unquote(line.substring(start, end));
			String imgn = parseImgName(img);
			if (imgn.length() == 0 || img.regionMatches(true, 0, "data:", 0, 5)) {
				// Nothing to download: keep the reference as written.
				out.append(line.substring(copied, end));
			} else {
				out.append(line.substring(copied, start));
				out.append("images/").append(imgn);
				dowmImg(resolveUrl(domainCss, pathCss, img), imageDir + imgn);
			}
			copied = end;
		}
		out.append(line.substring(copied));
		return out.toString();
	}

	/**
	 * Downloads one resource. Best effort: a failure is reported on
	 * System.err and does not stop the processing of the css file.
	 * 
	 * @param host
	 *            absolute URL of the resource
	 * @param saveTo
	 *            path of the local file to write
	 */
	private void dowmImg(String host, String saveTo) {
		InputStream is = null;
		OutputStream os = null;
		try {
			is = new URL(host).openStream();

			File parent = new File(saveTo).getParentFile();
			if (parent != null && !parent.exists()) {
				parent.mkdirs();
			}
			os = new FileOutputStream(saveTo);

			byte[] b = new byte[2048];
			int n;
			while ((n = is.read(b)) != -1) {
				os.write(b, 0, n);
			}
		} catch (Exception e) {
			System.err.println("cannot download " + host + ": " + e);
		} finally {
			closeQuietly(os);
			closeQuietly(is);
		}
	}

	/**
	 * Derives the "_files" folder from the html file path by replacing a
	 * trailing .htm or .html (any letter case) with "_files".
	 */
	private void initHtmlFiles() {
		if (files == null) {
			if (endsWithIgnoreCase(htmlFilePath, ".htm")) {
				files = htmlFilePath.substring(0, htmlFilePath.length() - 4)
						+ "_files";
			} else if (endsWithIgnoreCase(htmlFilePath, ".html")) {
				files = htmlFilePath.substring(0, htmlFilePath.length() - 5)
						+ "_files";
			}
		}
	}

	/**
	 * Splits the page URL and uses the result as the default base for css
	 * images as well.
	 */
	private void parsePageUrl(String url) {
		String[] arr = parseUrl(url);
		domainPage = arr[0];
		pathPage = arr[1];

		domainCss = domainPage;
		pathCss = pathPage;
	}

	/**
	 * [domain,path]
	 * 
	 * domain is scheme plus host; path is the directory of the resource
	 * without trailing slash, or "/" when the resource lies in the site root.
	 * Query string and fragment are ignored.
	 * 
	 * @param url
	 *            absolute http or https URL
	 * @return array of domain and path
	 * @throws RuntimeException
	 *             when url does not start with http:// or https://
	 */
	private String[] parseUrl(String url) {
		int start;
		if (url.regionMatches(true, 0, "https://", 0, 8)) {
			start = 8;
		} else if (url.regionMatches(true, 0, "http://", 0, 7)) {
			start = 7;
		} else {
			throw new RuntimeException(
					"expected start with http:// or https://");
		}
		// A slash inside the query string must not count as a path separator.
		String base = stripQueryAndFragment(url);
		int pos = base.indexOf("/", start);
		String domain;
		String path = "/";
		if (pos != -1) {
			domain = base.substring(0, pos);
			int end = base.lastIndexOf("/");
			if (end > pos) {
				path = base.substring(pos, end);
			}
		} else {
			domain = base;
		}

		return new String[] { domain, path };
	}

	/**
	 * Selects the base (domainCss, pathCss) for the images of a css file:
	 * the location of the page link whose URL contains the file name, or the
	 * page location when no link matches.
	 * 
	 * @param name
	 *            css file name
	 */
	private void parseCssLink(String name) {
		if (cssLinks != null) {
			String key = "/" + name;
			// Second chance: the name without its last three characters
			// ("style.css" -> "style.").
			String key2 = name.length() > 3 ? "/"
					+ name.substring(0, name.length() - 3) : key;
			int find = -1;
			int find2 = -1;
			for (int idx = 0; idx < cssLinks.length; idx++) {
				if (cssLinks[idx].indexOf(key) != -1) {
					find = idx;
				} else if (cssLinks[idx].indexOf(key2) != -1) {
					find2 = idx;
				}
			}
			if (find < 0) {
				find = find2;
			}
			if (find > -1) {
				String[] arr = parseUrl(cssLinks[find]);
				domainCss = arr[0];
				pathCss = arr[1];
			} else {
				domainCss = domainPage;
				pathCss = pathPage;
			}
		}
	}

	/**
	 * parseImgUrl
	 * 
	 * @param url
	 *            image reference as found in the css file
	 * @return the part behind the last slash, without query string and
	 *         fragment; empty when the reference names no file
	 */
	private String parseImgName(String url) {
		String name = stripQueryAndFragment(url);
		int pos = name.lastIndexOf("/");
		if (pos >= 0) {
			name = name.substring(pos + 1);
		}
		return name;
	}

	/**
	 * trim ../ or ./
	 * 
	 * Resolves a relative link against a directory path segment by segment:
	 * "." segments are dropped and each ".." removes the segment before it.
	 * A ".." that would leave the root is ignored.
	 * 
	 * @param path
	 *            directory path such as "/a/b"; "/" or "" for the root
	 * @param link
	 *            relative reference such as "../img/x.png"
	 * @return the resolved path, starting with "/" when path does
	 */
	public String prefyUrl(String path, String link) {
		String resolved = path;
		// "/" and "/a/b/" are treated like "" and "/a/b" so that joining
		// never produces a double slash.
		while (resolved.endsWith("/")) {
			resolved = resolved.substring(0, resolved.length() - 1);
		}
		String[] segments = link.split("/", -1);
		for (int idx = 0; idx < segments.length; idx++) {
			String segment = segments[idx];
			if (".".equals(segment)) {
				continue;
			}
			if ("..".equals(segment)) {
				int pos = resolved.lastIndexOf("/");
				resolved = pos >= 0 ? resolved.substring(0, pos) : "";
			} else {
				resolved = resolved + "/" + segment;
			}
		}
		return resolved;
	}

	/**
	 * Turns a reference found in the page or in a css file into an absolute
	 * URL.
	 * 
	 * @param domain
	 *            scheme and host of the referencing document
	 * @param path
	 *            directory of the referencing document
	 * @param ref
	 *            absolute, protocol-relative, root-relative or relative
	 *            reference
	 * @return absolute URL
	 */
	private String resolveUrl(String domain, String path, String ref) {
		if (ref.regionMatches(true, 0, "http://", 0, 7)
				|| ref.regionMatches(true, 0, "https://", 0, 8)) {
			return ref;
		}
		if (ref.startsWith("//")) {
			// Protocol-relative: reuse the scheme of the referencing document.
			return domain.substring(0, domain.indexOf(":") + 1) + ref;
		}
		if (ref.startsWith("/")) {
			return domain + ref;
		}
		return domain + prefyUrl(path, ref);
	}

	/** Cuts a URL at its first '?' or '#'. */
	private static String stripQueryAndFragment(String url) {
		int cut = url.length();
		int query = url.indexOf('?');
		if (query != -1) {
			cut = query;
		}
		int fragment = url.indexOf('#');
		if (fragment != -1 && fragment < cut) {
			cut = fragment;
		}
		return url.substring(0, cut);
	}

	/** Trims the value and removes one pair of surrounding quotes. */
	private static String unquote(String value) {
		String v = value.trim();
		if (v.length() >= 2) {
			char first = v.charAt(0);
			if ((first == '"' || first == '\'')
					&& v.charAt(v.length() - 1) == first) {
				v = v.substring(1, v.length() - 1).trim();
			}
		}
		return v;
	}

	/** Case-insensitive variant of String.endsWith. */
	private static boolean endsWithIgnoreCase(String text, String suffix) {
		int offset = text.length() - suffix.length();
		return offset >= 0
				&& text.regionMatches(true, offset, suffix, 0, suffix.length());
	}

	/** Closes the stream, ignoring null and close failures. */
	private static void closeQuietly(InputStream in) {
		if (in != null) {
			try {
				in.close();
			} catch (Exception ignored) {
				// nothing useful can be done when close fails
			}
		}
	}

	/** Closes the stream, ignoring null and close failures. */
	private static void closeQuietly(OutputStream out) {
		if (out != null) {
			try {
				out.close();
			} catch (Exception ignored) {
				// nothing useful can be done when close fails
			}
		}
	}

	/**
	 * Runs the tool.
	 * 
	 * @param args
	 *            optional: args[0] page URL, args[1] path of the saved html
	 *            file; the built-in example values are used otherwise
	 */
	public static void main(String[] args) {
		// set http proxy
		// System.setProperty("http.proxyHost", "110.196.190.103");
		// System.setProperty("http.proxyPort", "8080");

		String pageUrl = "http://buy.ccb.com/";
		// Path the page was saved to.
		String htmlFilePath = "C:/Users/Administrator/Desktop/tmp/善融商务个人商城-建设银行旗下B2C个人购物商城平台，支持信用卡分期和担保交易，品质保证。.htm";

		if (args != null && args.length >= 2) {
			pageUrl = args[0];
			htmlFilePath = args[1];
		}

		IeHtmlTool tool = new IeHtmlTool(pageUrl, htmlFilePath);
		tool.prefyCss();
	}

}

package com.chruan.util;

import java.io.File;
import java.util.List;

public class DirUtil {

	/**
	 * Collects the absolute paths of all existing files below a file or
	 * directory whose path ends with the given suffix. Directories are
	 * walked recursively.
	 * 
	 * @param list
	 *            receives the absolute paths (as String)
	 * @param file
	 *            file or directory to start from; null is ignored
	 * @param type
	 *            required path suffix such as ".css"; nothing is collected
	 *            when it is null
	 */
	public static void dir(List list, File file, String type) {
		if (file == null) {
			return;
		}
		if (file.isDirectory()) {
			File[] fs = file.listFiles();
			// listFiles() returns null when the directory cannot be read.
			if (fs == null) {
				return;
			}
			for (int idx = 0; idx < fs.length; idx++) {
				dir(list, fs[idx], type);
			}
		} else if (type != null && file.isFile()) {
			// isFile() keeps paths that do not exist out of the result.
			String p = file.getAbsolutePath();
			if (p.endsWith(type)) {
				list.add(p);
			}
		}
	}
}