package com.tag;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import toptrack.tools.JQuery;
import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.activation.MimetypesFileTypeMap;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Session;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePartDataSource;
/**
* mht文件解析类
* @author dl
*/
public class Html2MHTCompiler {
/** Address (URL) of the web page. */
private URL strWeb = null;
/** Text content of the web page. */
private String strText = null;
/** Name of the local file. */
private String strFileName = null;
/** Character encoding of the web page. */
private String strEncoding = null;
// Additional information for the MHT format.
// NOTE(review): these look like mail-style header values (sender, recipients,
// subject, SMTP host); where they are consumed is not visible here -- confirm.
private String from = "dongle2001@126.com";
private String to;
private String subject = "mht compile";
private String cc;
private String bcc;
private String smtp = "localhost";
/**
 * Command-line entry point: obtains the HTML text of a page and compiles it
 * into an MHT file.
 *
 * <p>All arguments are optional and positional. Any argument that is omitted
 * falls back to the value that used to be hard-coded here:
 * <ol>
 *   <li>page URL (default {@code http://www.mtime.com/my/tropicofcancer/blog/843555/})</li>
 *   <li>page encoding (default {@code utf-8})</li>
 *   <li>output file name (default {@code test.mht})</li>
 * </ol>
 *
 * @param args optional {@code [url [encoding [outputFile]]]}; may be empty or {@code null}
 */
public static void main(String[] args) {
    // Tolerate a null array so programmatic callers that passed null keep working.
    int argCount = (args == null) ? 0 : args.length;
    String strUrl = argCount > 0 ? args[0] : "http://www.mtime.com/my/tropicofcancer/blog/843555/";
    String strEncoding = argCount > 1 ? args[1] : "utf-8";
    String strFileName = argCount > 2 ? args[2] : "test.mht";

    String strText = JQuery.getHtmlText(strUrl, strEncoding, null);
    if (strText == null) {
        // Previously this exited silently; say why nothing was produced.
        System.err.println("No HTML text returned for " + strUrl);
        return;
    }

    Html2MHTCompiler h2t = new Html2MHTCompiler(strText, strUrl, strEncoding, strFileName);
    // compile() reports failure through its boolean result; do not discard it.
    if (!h2t.compile()) {
        System.err.println("Failed to compile " + strUrl + " into " + strFileName);
    }
    // Disabled in the original source, kept for reference:
    //Html2MHTCompiler.mht2html("test.mht", "a.html");
}
/**
*<br>方法说明:初始化
*<br>输入参数:strText 网页文本内容; strUrl 网页地址; strEncoding 网页编码; strFileName 本地文件名
*<br>返回类型:
*/
public Html2MHTCompiler(String strText, String strUrl, String strEncoding, String strFileName) {
// TODO Auto-generated constructor stub
try {
strWeb = new URL(strUrl);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return;
}
this.strText = strText;
this.strEncoding = strEncoding;
this.strFileName = strFileName;
}
/**
 * Runs the conversion: parses the page text, collects script and image
 * resources, rewrites the page text using the collected URL map, and builds
 * the MHT archive.
 *
 * <p>A {@link ParserException} during parsing is printed and processing
 * continues with an empty node list.
 *
 * @return {@code false} if the page URL, text, file name or encoding is
 *         {@code null}, or if creating the archive throws; {@code true} otherwise
 */
public boolean compile() {
    boolean inputsPresent = strWeb != null
            && strText != null
            && strFileName != null
            && strEncoding != null;
    if (!inputsPresent) {
        return false;
    }

    // Starts empty so the later steps still run if parsing fails.
    NodeList parsedNodes = new NodeList();
    try {
        Parser htmlParser = createParser(strText);
        htmlParser.setEncoding(strEncoding);
        parsedNodes = htmlParser.parse(null);
    } catch (ParserException parseFailure) {
        parseFailure.printStackTrace();
    }

    // Single-argument overload: looks at BASE tags (base URL extraction).
    extractAllScriptNodes(parsedNodes);

    HashMap urlMap = new HashMap();
    ArrayList scriptUrls = extractAllScriptNodes(parsedNodes, urlMap);
    ArrayList imageUrls = extractAllImageNodes(parsedNodes, urlMap);

    // Apply every mapping collected above to the page text.
    // NOTE(review): presumably swaps each map value for its key in the text --
    // confirm against JHtmlClear.replace.
    Iterator mappings = urlMap.entrySet().iterator();
    while (mappings.hasNext()) {
        Map.Entry mapping = (Map.Entry) mappings.next();
        String mapKey = (String) mapping.getKey();
        String mapValue = (String) mapping.getValue();
        strText = JHtmlClear.replace(strText, mapValue, mapKey);
    }

    try {
        createMhtArchive(strText, scriptUrls, imageUrls);
        return true;
    } catch (Exception archiveFailure) {
        archiveFailure.printStackTrace();
        return false;
    }
}
/**
 * Builds an HTML parser over the given page text.
 *
 * <p>The text is wrapped in a {@link Page}, fed to a {@link Lexer}, and the
 * resulting {@link Parser} is given a {@link DefaultParserFeedback} at the
 * {@code QUIET} level.
 *
 * @param inputHTML text content of the web page
 * @return a parser over {@code inputHTML}
 */
private Parser createParser(String inputHTML) {
    Page sourcePage = new Page(inputHTML);
    Lexer pageLexer = new Lexer(sourcePage);
    DefaultParserFeedback quietFeedback = new DefaultParserFeedback(DefaultParserFeedback.QUIET);
    return new Parser(pageLexer, quietFeedback);
}
/**
*<br>方法说明:抽取基础URL地址
*<br>输入参数:nodes 网页标签集合
*<br>返回类型:
*/
private void extractAllScriptNodes(NodeList nodes) {
NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(
"BASE"), true);
if (filtered != null && filtered.size() > 0) {
Tag tag = (Tag) filtered.elem
使用java将网页保存为mht格式
最新推荐文章于 2022-11-17 17:08:41 发布
这是一个Java程序，用于将网页内容转换为MHT格式。它使用HTMLParser库来解析网页，提取其中的脚本和图片链接，并将它们转换为绝对URL。然后，程序将内容和资源打包到MHT档案中，支持相关的邮件处理和数据存储功能。
摘要由CSDN通过智能技术生成