java 保存网页

最新推荐文章于 2022-09-05 10:17:58 发布
lishigui
最新推荐文章于 2022-09-05 10:17:58 发布
阅读量3.9k
点赞数
分类专栏：转接技术　文章标签：java html web mhtml net
本文链接：https://blog.csdn.net/lishigui/article/details/5749952
版权
转接技术专栏收录该内容
2 篇文章 0 订阅
订阅专栏
这是我从网上搜到的一段代码，用 Java 将网页保存为 mht 格式，感觉不错，所以共享给大家，让朋友们也学习学习！
需要用到的 jar 包有 JavaMail，下载地址：http://java.sun.com/products/javamail/downloads/index.html
还有 htmlparser，下载地址：http://sourceforge.net/projects/htmlparser/files/
package com.tag;   
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.activation.MimetypesFileTypeMap;
import javax.mail.Authenticator;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.PasswordAuthentication;
import javax.mail.Session;
import javax.mail.internet.AddressException;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimeUtility;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * 
 * mht文件解析类
 * 
 */
public class HtmlToMht {

	/** Charset assumed when the page declares none, or declares one this JVM does not support. */
	private static final String DEFAULT_ENCODING = "UTF-8";

	/**
	 * Locates the first {@code charset=...} declaration in the page markup. The value may be
	 * unquoted (inside a Content-Type {@code content} attribute) or quoted
	 * ({@code <meta charset="utf-8">}).
	 */
	private static final Pattern CHARSET_PATTERN = Pattern.compile(
			"charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9][A-Za-z0-9._:+-]*)",
			Pattern.CASE_INSENSITIVE);

	/** Content types chosen from the file extension of a resource URL (lower-case keys). */
	private static final Map<String, String> MIME_BY_EXTENSION = new HashMap<String, String>();

	/** Fallback content types per resource category, used when the extension is not recognised. */
	private static final Map<String, String> MIME_BY_CATEGORY = new HashMap<String, String>();

	static {
		MIME_BY_EXTENSION.put("css", "text/css");
		MIME_BY_EXTENSION.put("js", "text/javascript");
		MIME_BY_EXTENSION.put("png", "image/png");
		MIME_BY_EXTENSION.put("gif", "image/gif");
		MIME_BY_EXTENSION.put("jpg", "image/jpeg");
		MIME_BY_EXTENSION.put("jpeg", "image/jpeg");
		MIME_BY_EXTENSION.put("bmp", "image/bmp");
		MIME_BY_EXTENSION.put("ico", "image/x-icon");
		MIME_BY_EXTENSION.put("svg", "image/svg+xml");
		MIME_BY_EXTENSION.put("webp", "image/webp");

		MIME_BY_CATEGORY.put("image", "image/jpeg");
		MIME_BY_CATEGORY.put("text", "text/plain");
	}

	/** Charset of the downloaded page; set by the constructor. */
	private String strEncoding = null;

	// Mail headers written into the archive. The MHT format is a MIME message, so these
	// headers are part of the file, but no mail is ever sent.
	private String from = "lishigui@126.com";
	private String to = "lishigui@126.com";
	private String subject = "blog.csdn.net/lishigui";
	private String cc;
	private String bcc;

	public static void main(String[] args) {
		new HtmlToMht("http://blog.csdn.net/lishigui","C:");
	}

	/**
	 * Downloads the page at {@code strUrl} and saves it as an MHT archive in
	 * {@code strFilePath}.
	 *
	 * <p>Failures are reported on the standard error stream and otherwise ignored; in
	 * particular nothing is written when the page cannot be downloaded.
	 *
	 * @param strUrl      address of the web page
	 * @param strFilePath directory the archive is written to
	 */
	public HtmlToMht(String strUrl, String strFilePath) {
		byte[] pageBytes = downBinaryFile(strUrl);
		if (pageBytes == null) {
			// Nothing was downloaded; downBinaryFile has already reported the cause.
			return;
		}

		strEncoding = detectEncoding(pageBytes);
		System.err.println(strEncoding);

		String strText;
		try {
			strText = new String(pageBytes, strEncoding);
		} catch (UnsupportedEncodingException e) {
			// detectEncoding only returns supported charsets, so this is not expected.
			e.printStackTrace();
			return;
		}

		try {
			compile(new URL(strUrl), strText, strFilePath);
		} catch (MalformedURLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Determines the charset declared inside the page markup.
	 *
	 * @param pageBytes raw bytes of the page
	 * @return the declared charset name when this JVM supports it, otherwise
	 *         {@link #DEFAULT_ENCODING}
	 */
	private static String detectEncoding(byte[] pageBytes) {
		String markup;
		try {
			// ISO-8859-1 maps every byte to exactly one char, so the ASCII markup can be
			// searched independently of the platform default charset.
			markup = new String(pageBytes, "ISO-8859-1");
		} catch (UnsupportedEncodingException e) {
			return DEFAULT_ENCODING;
		}

		Matcher matcher = CHARSET_PATTERN.matcher(markup);
		if (matcher.find()) {
			String declared = matcher.group(1);
			try {
				if (Charset.isSupported(declared)) {
					return declared;
				}
			} catch (IllegalArgumentException ignored) {
				// Syntactically illegal charset name: fall through to the default.
			}
		}
		return DEFAULT_ENCODING;
	}

	/**
	 * Collects the resources referenced by the page and writes the MHT archive.
	 *
	 * <p>The page text is archived unchanged; each resource is stored under its absolute
	 * URL in a {@code Content-Location} header.
	 *
	 * @param strWeb      address of the web page
	 * @param strText     decoded text of the web page
	 * @param strFilePath directory the archive is written to
	 * @return {@code true} when the archive was written, {@code false} when an argument is
	 *         {@code null} or writing the archive failed
	 */
	public boolean compile(URL strWeb, String strText, String strFilePath) {
		if (strWeb == null || strText == null || strFilePath == null) {
			return false;
		}

		NodeList nodes = new NodeList();
		try {
			nodes = createParser(strText).parse(null);
		} catch (ParserException e) {
			// Best effort: an unparsable page is archived without its resources.
			e.printStackTrace();
		}

		// A <base href> in the page overrides the page address for resolving links.
		URL baseUrl = extractBaseUrl(nodes);
		if (baseUrl == null) {
			baseUrl = strWeb;
		}

		// Shared between both extraction passes so that no URL is downloaded twice.
		Map<String, String> urlMap = new HashMap<String, String>();
		List<String> urlScriptList = extractAllScriptNodes(nodes, urlMap, baseUrl);
		List<String> urlImageList = extractAllImageNodes(nodes, urlMap, baseUrl);

		try {
			createMhtArchive(strText, urlScriptList, urlImageList, strWeb, strFilePath);
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}

	/**
	 * Downloads the content behind a URL.
	 *
	 * <p>The body is read until end of stream, so responses that do not announce a
	 * {@code Content-Length} are supported as well.
	 *
	 * @param url address to download
	 * @return the downloaded bytes, or {@code null} when the URL is malformed, the download
	 *         fails or the body is empty
	 */
	public byte[] downBinaryFile(String url) {
		System.out.println(url);
		InputStream in = null;
		try {
			URLConnection uc = new URL(url).openConnection();
			in = new BufferedInputStream(uc.getInputStream());

			// The announced length is only a sizing hint; it may be -1 or wrong.
			int announced = uc.getContentLength();
			ByteArrayOutputStream buffer = new ByteArrayOutputStream(announced > 0 ? announced : 8192);
			byte[] chunk = new byte[8192];
			int bytesRead;
			while ((bytesRead = in.read(chunk)) != -1) {
				buffer.write(chunk, 0, bytesRead);
			}
			return buffer.size() > 0 ? buffer.toByteArray() : null;
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done when closing fails.
				}
			}
		}
		return null;
	}

	/**
	 * Creates an HTML parser for the given page text.
	 *
	 * @param inputHTML text of the web page
	 * @return a parser with quiet feedback
	 */
	private Parser createParser(String inputHTML) {
		Lexer mLexer = new Lexer(new Page(inputHTML));
		return new Parser(mLexer, new DefaultParserFeedback(
				DefaultParserFeedback.QUIET));
	}

	/**
	 * Extracts the base URL declared by the first {@code <base href="...">} tag.
	 *
	 * @param nodes parsed nodes of the page
	 * @return the declared base URL, or {@code null} when there is none or it is malformed
	 */
	private URL extractBaseUrl(NodeList nodes) {
		NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("BASE"), true);
		if (filtered == null || filtered.size() == 0) {
			return null;
		}

		String href = ((Tag) filtered.elementAt(0)).getAttribute("href");
		if (href == null || href.length() == 0) {
			return null;
		}
		try {
			return new URL(href);
		} catch (MalformedURLException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Collects the external scripts and style sheets referenced by the page.
	 *
	 * @param nodes  parsed nodes of the page
	 * @param urlMap absolute URLs collected so far, mapped to the link text found in the page
	 * @param strWeb base URL used to resolve relative links
	 * @return absolute URLs of the scripts and style sheets not already in {@code urlMap}
	 */
	private List<String> extractAllScriptNodes(NodeList nodes, Map<String, String> urlMap,
			URL strWeb) {

		List<String> urlList = new ArrayList<String>();

		NodeList scripts = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);
		for (int i = 0; i < scripts.size(); i++) {
			Tag tag = (Tag) scripts.elementAt(i);
			String src = tag.getAttribute("src");
			System.out.println("script src="+src);
			registerResource(tag, "src", src, strWeb, urlMap, urlList);
		}

		NodeList links = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);
		for (int i = 0; i < links.size(); i++) {
			Tag tag = (Tag) links.elementAt(i);
			String href = tag.getAttribute("href");
			boolean hasHref = href != null && href.length() > 0;
			if (hasHref && isStyleSheetLink(tag.getAttribute("rel"), tag.getAttribute("type"))) {
				System.out.println("css link="+href);
				registerResource(tag, "href", href, strWeb, urlMap, urlList);
			}
		}

		return urlList;
	}

	/**
	 * Collects the images referenced by the page.
	 *
	 * @param nodes  parsed nodes of the page
	 * @param urlMap absolute URLs collected so far, mapped to the link text found in the page
	 * @param strWeb base URL used to resolve relative links
	 * @return absolute URLs of the images not already in {@code urlMap}
	 */
	private List<String> extractAllImageNodes(NodeList nodes, Map<String, String> urlMap,
			URL strWeb) {

		List<String> urlList = new ArrayList<String>();
		NodeList images = nodes.extractAllNodesThatMatch(new TagNameFilter("IMG"), true);
		for (int i = 0; i < images.size(); i++) {
			Tag tag = (Tag) images.elementAt(i);
			String src = tag.getAttribute("src");
			System.out.println("IMG src="+src);
			registerResource(tag, "src", src, strWeb, urlMap, urlList);
		}
		return urlList;
	}

	/**
	 * Tells whether a {@code <link>} tag refers to a style sheet.
	 *
	 * @param rel  value of the {@code rel} attribute, may be {@code null}
	 * @param type value of the {@code type} attribute, may be {@code null}
	 * @return {@code true} when {@code rel} mentions "stylesheet" or {@code type} mentions
	 *         "text/css", ignoring case
	 */
	private static boolean isStyleSheetLink(String rel, String type) {
		boolean byRel = rel != null
				&& rel.toLowerCase(Locale.ENGLISH).indexOf("stylesheet") != -1;
		boolean byType = type != null
				&& type.toLowerCase(Locale.ENGLISH).indexOf("text/css") != -1;
		return byRel || byType;
	}

	/**
	 * Resolves one link, records it if it is new and points the tag at the absolute URL.
	 *
	 * @param tag       tag that carries the link
	 * @param attribute name of the attribute holding the link
	 * @param innerURL  link text as found in the page, may be {@code null} or empty
	 * @param strWeb    base URL used to resolve relative links
	 * @param urlMap    absolute URLs collected so far; updated by this call
	 * @param urlList   receives the absolute URL when it has not been seen before
	 */
	private void registerResource(Tag tag, String attribute, String innerURL, URL strWeb,
			Map<String, String> urlMap, List<String> urlList) {
		if (innerURL == null || innerURL.length() == 0) {
			return;
		}
		String absoluteURL = makeAbsoluteURL(strWeb, innerURL);
		if (absoluteURL == null) {
			// The link cannot be turned into a URL; leave the tag untouched.
			return;
		}
		if (!urlMap.containsKey(absoluteURL)) {
			urlMap.put(absoluteURL, innerURL);
			urlList.add(absoluteURL);
		}
		tag.setAttribute(attribute, absoluteURL);
	}

	/**
	 * Converts a link found in the page into an absolute URL.
	 *
	 * <p>Any query string is cut off first. Links that already start with {@code http://}
	 * or {@code https://} are returned as they are; everything else is resolved against
	 * {@code strWeb}.
	 *
	 * @param strWeb   base URL used to resolve relative links, may be {@code null}
	 * @param innerURL link text as found in the page
	 * @return the absolute URL, or {@code null} when {@code innerURL} is {@code null} or
	 *         cannot be resolved
	 */
	public String makeAbsoluteURL(URL strWeb, String innerURL) {
		if (innerURL == null) {
			return null;
		}

		// Drop the query string.
		int pos = innerURL.indexOf("?");
		if (pos != -1) {
			innerURL = innerURL.substring(0, pos);
		}

		// Without a base there is no scheme to inherit for protocol-relative links.
		if (strWeb == null && innerURL.startsWith("//")) {
			innerURL = "http:" + innerURL;
		}

		// Require the full scheme prefix so that relative names such as "http_logo.png"
		// are still resolved against the base.
		if (innerURL.regionMatches(true, 0, "http://", 0, 7)
				|| innerURL.regionMatches(true, 0, "https://", 0, 8)) {
			return innerURL;
		}

		URL linkUri;
		try {
			linkUri = new URL(strWeb, innerURL);
		} catch (MalformedURLException e) {
			e.printStackTrace();
			return null;
		}

		// Remove any dot segments that are still present after resolution.
		String absURL = linkUri.toString();
		absURL = absURL.replace("../", "");
		absURL = absURL.replace("./", "");
		System.out.println(absURL);

		return absURL;
	}

	/**
	 * Builds the multipart MIME message and writes it to
	 * {@code <strFilePath>/<last segment of strWeb>.mht}.
	 *
	 * @param content       text of the web page
	 * @param urlScriptList absolute URLs of scripts and style sheets
	 * @param urlImageList  absolute URLs of images
	 * @param strWeb        address of the web page
	 * @param strFilePath   directory the archive is written to
	 * @throws MessagingException when the message cannot be assembled
	 * @throws IOException        when the archive file cannot be written
	 */
	private void createMhtArchive(String content, List<String> urlScriptList,
			List<String> urlImageList, URL strWeb, String strFilePath)
			throws MessagingException, IOException {

		String charset = strEncoding != null ? strEncoding : DEFAULT_ENCODING;

		// Work on a copy: the JVM-wide system properties must not be modified.
		Properties properties = new Properties();
		properties.putAll(System.getProperties());
		properties.put("mail.smtp.host", "smtp.126.com");
		properties.put("mail.smtp.auth", "true");

		// A private session instead of the JVM-wide default one.
		Session session = Session.getInstance(properties, new Email_auth(from, ""));
		MimeMessage msg = new MimeMessage(session);

		msg.setHeader("X-Mailer", "Code Manager .SWT");
		if (from != null) {
			msg.setFrom(new InternetAddress(from));
		}
		if (subject != null) {
			msg.setSubject(subject);
		}
		if (to != null) {
			msg.setRecipients(Message.RecipientType.TO, getInetAddresses(to));
		}
		if (cc != null) {
			msg.setRecipients(Message.RecipientType.CC, getInetAddresses(cc));
		}
		if (bcc != null) {
			msg.setRecipients(Message.RecipientType.BCC, getInetAddresses(bcc));
		}

		MimeMultipart mp = new MimeMultipart("related");

		// The page itself is the first part.
		MimeBodyPart bp = new MimeBodyPart();
		bp.setText(content, charset);
		bp.addHeader("Content-Type", "text/html;charset=" + charset);
		bp.addHeader("Content-Location", strWeb.toString());
		mp.addBodyPart(bp);

		addResourceParts(mp, urlScriptList, "text", charset);
		addResourceParts(mp, urlImageList, "image", charset);
		msg.setContent(mp);

		// Name the archive after the last path segment of the page address, replacing
		// characters that are not allowed in file names.
		String[] segments = strWeb.toString().split("/");
		String fileName = segments[segments.length - 1].replaceAll("[\\\\/:*?\"<>|]", "_");

		OutputStream out = new FileOutputStream(strFilePath + File.separator + fileName + ".mht");
		try {
			msg.writeTo(out);
		} finally {
			out.close();
		}
	}

	/**
	 * Downloads each resource and appends it to the multipart as its own body part.
	 *
	 * @param mp       multipart that receives the parts
	 * @param urls     absolute URLs of the resources
	 * @param category resource category, "text" or "image", used as content type fallback
	 * @param charset  charset used to decode percent-escapes in the URLs
	 * @throws MessagingException when a part cannot be assembled
	 * @throws IOException        when a URL cannot be encoded as a header value
	 */
	private void addResourceParts(MimeMultipart mp, List<String> urls, String category,
			String charset) throws MessagingException, IOException {
		for (String absoluteURL : urls) {
			String location;
			try {
				location = URLDecoder.decode(absoluteURL, charset);
			} catch (IllegalArgumentException e) {
				// Malformed percent-escape: keep the URL as it is rather than lose the archive.
				location = absoluteURL;
			}

			MimeBodyPart part = new MimeBodyPart();
			part.addHeader("Content-Location", MimeUtility.encodeWord(location));
			part.setDataHandler(new DataHandler(new AttachmentDataSource(absoluteURL, category)));
			mp.addBodyPart(part);
		}
	}

	/**
	 * Parses a comma separated list of mail addresses.
	 *
	 * @param emails comma separated addresses
	 * @return the parsed addresses; blank entries are skipped
	 * @throws AddressException when an entry is not a valid address
	 */
	private InternetAddress[] getInetAddresses(String emails) throws AddressException {
		List<InternetAddress> addresses = new ArrayList<InternetAddress>();
		StringTokenizer tok = new StringTokenizer(emails, ",");
		while (tok.hasMoreTokens()) {
			String address = tok.nextToken().trim();
			if (address.length() > 0) {
				addresses.add(new InternetAddress(address));
			}
		}
		return addresses.toArray(new InternetAddress[addresses.size()]);
	}

	/**
	 * Data source for one archived resource. The resource is downloaded once, when the
	 * data source is created, and then served from memory.
	 */
	class AttachmentDataSource implements DataSource {

		private final MimetypesFileTypeMap map = new MimetypesFileTypeMap();
		private final String strUrl;
		private final String strType;
		private byte[] data;

		/**
		 * Downloads the resource.
		 *
		 * @param strUrl  absolute URL of the resource
		 * @param strType resource category, "text" or "image"
		 */
		public AttachmentDataSource(String strUrl, String strType) {
			this.strType = strType;
			// Spaces are not valid inside a URL; escape them before downloading.
			this.strUrl = strUrl.trim().replaceAll(" ", "%20");
			this.data = downBinaryFile(this.strUrl);
		}

		/**
		 * @return the MIME type of the resource
		 */
		public String getContentType() {
			return getMimeType(getName());
		}

		/**
		 * @return the part of the URL after its last slash, or the whole URL when it
		 *         contains no slash
		 */
		public String getName() {
			// URLs always use '/', whatever the file separator of the platform is.
			int slash = strUrl.lastIndexOf('/');
			return slash >= 0 ? strUrl.substring(slash + 1) : strUrl;
		}

		/**
		 * Chooses a content type: first by file extension, then by resource category,
		 * then through the MIME type map, and finally {@code application/octet-stream}.
		 *
		 * @param fileName name of the resource
		 * @return the content type, never {@code null}
		 */
		private String getMimeType(String fileName) {
			int dot = fileName.lastIndexOf('.');
			if (dot >= 0 && dot < fileName.length() - 1) {
				String extension = fileName.substring(dot + 1).toLowerCase(Locale.ENGLISH);
				String byExtension = MIME_BY_EXTENSION.get(extension);
				if (byExtension != null) {
					return byExtension;
				}
			}

			String byCategory = MIME_BY_CATEGORY.get(strType);
			if (byCategory != null) {
				return byCategory;
			}

			String guessed = map.getContentType(fileName);
			return guessed != null ? guessed : "application/octet-stream";
		}

		/**
		 * @return a stream over the downloaded bytes; empty when the download failed
		 */
		public InputStream getInputStream() throws IOException {
			if (data == null) {
				data = new byte[0];
			}
			return new ByteArrayInputStream(data);
		}

		/**
		 * @return a new in-memory stream; bytes written to it are not stored by this
		 *         data source
		 */
		public OutputStream getOutputStream() throws IOException {
			return new ByteArrayOutputStream();
		}

	}

	/**
	 * Authenticator that answers with a fixed user name and password.
	 */
	class Email_auth extends Authenticator {

		String auth_user;
		String auth_password;

		public Email_auth() {
			super();
		}

		/**
		 * @param user     user name to answer with
		 * @param password password to answer with
		 */
		public Email_auth(String user, String password) {
			super();
			setUsername(user);
			setUserpass(password);
		}

		public void setUsername(String username) {
			auth_user = username;
		}

		public void setUserpass(String userpass) {
			auth_password = userpass;
		}

		public PasswordAuthentication getPasswordAuthentication() {
			return new PasswordAuthentication(auth_user, auth_password);
		}

	}

}
lishigui
关注
0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
java 保存网页

 这是我从网上搜到的一段代码，用 Java 将网页保存为 mht 格式，感觉不错，所以共享给大家，让朋友们也学习学习！ 需要用到的 jar 包有 JavaMail，下载地址：http://java.sun.com/products/javamail/downloads/index.html 还有 htmlparser，下载地址：http://sourceforge.net/projects/htmlparser/files/ 
复制链接

扫一扫