java html 转成mht_使用java将网页保存为mht格式(1)

最新推荐文章于 2022-11-17 17:08:41 发布

Geequlim

最新推荐文章于 2022-11-17 17:08:41 发布

阅读量322

点赞数

文章标签： java html 转成mht

本文链接：https://blog.csdn.net/weixin_36378669/article/details/114191993

版权

使用java将网页保存为mht格式(1)

2009年1月5日来源：233网校网校课程在线题库评论

分享到

/**
 * Builds the MHT archive for the page text held in {@code strText}.
 *
 * <p>Steps, in order: parse {@code strText} using {@code strEncoding}; let a
 * {@code BASE} tag override {@code strWeb}; collect the script/stylesheet and
 * image URLs; rewrite every collected URL inside {@code strText} from the form
 * written in the page to its absolute form; pass the rewritten text and both
 * URL lists to {@code createMhtArchive}.
 *
 * @return {@code true} when the archive was created; {@code false} when one of
 *         {@code strWeb}, {@code strText}, {@code strFileName} or
 *         {@code strEncoding} is {@code null}, when the HTML cannot be parsed,
 *         or when archive creation throws
 */
public boolean compile() {
    if (strWeb == null || strText == null || strFileName == null || strEncoding == null) {
        return false;
    }

    NodeList nodes;
    try {
        Parser parser = createParser(strText);
        parser.setEncoding(strEncoding);
        nodes = parser.parse(null);
    } catch (ParserException e) {
        // A failed parse leaves nothing to collect resources from. Report it to the
        // caller, the same way an archive-creation failure is reported below, instead
        // of carrying on with an empty node list and returning true.
        e.printStackTrace();
        return false;
    }

    // Maps absolute URL -> URL as written in the page; filled by the extract* calls.
    HashMap urlMap = new HashMap();

    // Must run before the URL extraction: it may replace strWeb, which the
    // extraction methods use as the base for resolving relative URLs.
    extractAllScriptNodes(nodes);

    ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
    ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);

    // Rewrite each URL in the page text from its written form to its absolute form.
    for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
        Map.Entry entry = (Map.Entry) iter.next();
        String absoluteURL = (String) entry.getKey();
        String innerURL = (String) entry.getValue();
        strText = JHtmlClear.replace(strText, innerURL, absoluteURL);
    }

    try {
        createMhtArchive(strText, urlScriptList, urlImageList);
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    return true;
}

/**

*方法说明：建立HTML parser

*输入参数：inputHTML 网页文本内容

*返回类型：HTML parser

private Parser createParser(String inputHTML) {

// TODO Auto-generated method stub

Lexer mLexer = new Lexer(new Page(inputHTML));

return new Parser(mLexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));

}

/**

*方法说明：抽取基础URL地址

*输入参数：nodes 网页标签集合

*返回类型：

private void extractAllScriptNodes(NodeList nodes) {

NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(

"BASE"), true);

if (filtered != null && filtered.size() > 0) {

Tag tag = (Tag) filtered.elementAt(0);

String href = tag.getAttribute("href");

if (href != null && href.length() > 0) {

try {

strWeb = new URL(href);

} catch (MalformedURLException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

/**

*方法说明：抽取网页包含的css,js链接

*输入参数：nodes 网页标签集合; urlMap 已存在的url集合

*返回类型：css,js链接的集合

private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {

ArrayList urlList = new ArrayList();

NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("script"), true);

for (int i = 0; i < filtered.size(); i++) {

Tag tag = (Tag) filtered.elementAt(i);

String src = tag.getAttribute("src");

// Handle external css file’s url

if (src != null && src.length() > 0) {

String innerURL = src;

String absoluteURL = makeAbsoluteURL(strWeb, innerURL);

if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {

urlMap.put(absoluteURL, innerURL);

ArrayList urlInfo = new ArrayList();

urlInfo.add(innerURL);

urlInfo.add(absoluteURL);

urlList.add(urlInfo);

}

tag.setAttribute("src", absoluteURL);

}

filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"), true);

for (int i = 0; i < filtered.size(); i++) {

Tag tag = (Tag) filtered.elementAt(i);

String type = (tag.getAttribute("type"));

String rel = (tag.getAttribute("rel"));

String href = tag.getAttribute("href");

boolean isCssFile = false;

if (rel != null) {

isCssFile = rel.indexOf("stylesheet") != -1;

} else if (type != null) {

isCssFile |= type.indexOf("text/css") != -1;

}

// Handle external css file’s url

if (isCssFile && href != null && href.length() > 0) {

String innerURL = href;

String absoluteURL = makeAbsoluteURL(strWeb, innerURL);

if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {

urlMap.put(absoluteURL, innerURL);

ArrayList urlInfo = new ArrayList();

urlInfo.add(innerURL);

urlInfo.add(absoluteURL);

urlList.add(urlInfo);

}

tag.setAttribute("href", absoluteURL);

}

return urlList;

}

/**

*方法说明：抽取网页包含的图像链接

*输入参数：nodes 网页标签集合; urlMap 已存在的url集合

*返回类型：图像链接集合

*/

分享到：

责编：zj评论

Geequlim

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java html 转成mht_使用java将网页保存为mht格式(1)

使用java将网页保存为mht格式(1)2009年1月5日来源：233网校网校课程在线题库评论分享到public boolean compile() {if (strWeb == null || strText == null || strFileName == null || strEncoding == null)return false;HashMap urlMap = new HashM...
复制链接

扫一扫