java mht 转html_Java mht转换成html

最新推荐文章于 2021-06-04 05:33:37 发布

weixin_39991222

最新推荐文章于 2021-06-04 05:33:37 发布

阅读量724

点赞数

文章标签： java mht 转html

本文链接：https://blog.csdn.net/weixin_39991222/article/details/114076877

版权

import java.io.BufferedInputStream;

import java.io.BufferedOutputStream;

import java.io.BufferedReader;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.Reader;

import java.io.Writer;

import java.util.Enumeration;

import javax.activation.DataHandler;

import javax.mail.MessagingException;

import javax.mail.Multipart;

import javax.mail.Session;

import javax.mail.internet.MimeBodyPart;

import javax.mail.internet.MimeMessage;

import javax.mail.internet.MimeMultipart;

import javax.mail.internet.MimePartDataSource;

public class HtmlApplication{

/**
 * Command-line entry point.
 *
 * <p>Usage: {@code HtmlApplication [sourceMht destinationHtml]}. When fewer than two
 * arguments are supplied, the sample paths from the original post are used.
 *
 * @param args optional: {@code args[0]} is the MHT file to read and
 *             {@code args[1]} is the HTML file to write
 */
public static void main(String[] args){
    String srcMht = "C:\\Documents and Settings\\Administrator\\桌面\\test2.mht";
    String destHtml = "C:\\test2\\test.html";
    if (args != null && args.length >= 2) {
        srcMht = args[0];
        destHtml = args[1];
    }
    HtmlApplication.mht2html(srcMht, destHtml);
}

/**

* 将 mht文件转换成 html文件

* @param s_SrcMht

* @param s_DescHtml

public static void mht2html(String s_SrcMht, String s_DescHtml) {

try {

InputStream fis = new FileInputStream(s_SrcMht);

Session mailSession = Session.getDefaultInstance(System.getProperties(), null);

MimeMessage msg = new MimeMessage(mailSession, fis);

Object content = msg.getContent();

if (content instanceof Multipart){

MimeMultipart mp = (MimeMultipart)content;

MimeBodyPart bp1 = (MimeBodyPart)mp.getBodyPart(0);

//获取mht文件内容代码的编码

String strEncodng = getEncoding(bp1);

//获取mht文件的内容

String strText = getHtmlText(bp1, strEncodng);

if (strText == null)

return;

//创建以mht文件名称的文件夹，主要用来保存资源文件。

File parent = null;

if (mp.getCount() > 1) {

parent = new File(new File(s_DescHtml).getAbsolutePath() + ".files");

parent.mkdirs();

if (!parent.exists()){ //创建文件夹失败的话则退出

return;

}

//FOR中代码主要是保存资源文件及替换路径

for (int i = 1; i < mp.getCount(); ++i) {

MimeBodyPart bp = (MimeBodyPart)mp.getBodyPart(i);

//获取资源文件的路径

//例(获取： http://xxx.com/abc.jpg)

String strUrl = getResourcesUrl(bp);

if (strUrl==null || strUrl.length()==0)

continue;

DataHandler dataHandler = bp.getDataHandler();

MimePartDataSource source = (MimePartDataSource)dataHandler.getDataSource();

//获取资源文件的绝对路径

String FilePath = parent.getAbsolutePath() + File.separator + getName(strUrl, i);

File resources = new File(FilePath);

//保存资源文件

if (SaveResourcesFile(resources, bp.getInputStream())){

//将远程地址替换为本地地址如图片、JS、CSS样式等等

strText = strText.replace(strUrl, resources.getAbsolutePath());

}

//最后保存HTML文件

SaveHtml(strText, s_DescHtml, strEncodng);

}

} catch (Exception e) {

e.printStackTrace();

}

/**

* 获取mht文件内容中资源文件的名称

* @param strName

* @param ID

* @return

public static String getName(String strName, int ID) {

char separator1 = '/';

char separator2 = '\\';

//将换行替换

strName = strName.replaceAll("\r\n", "");

//获取文件名称

if( strName.lastIndexOf(separator1) >= 0){

return strName.substring(strName.lastIndexOf(separator1) + 1);

}

if( strName.lastIndexOf(separator2) >= 0){

return strName.substring(strName.lastIndexOf(separator2) + 1);

}

return "";

}

/**

* 将提取出来的html内容写入保存的路径中。

* @param strText

* @param strHtml

* @param strEncodng

public static boolean SaveHtml(String s_HtmlTxt, String s_HtmlPath , String s_Encode) {

try{

Writer out = null;

out = new OutputStreamWriter(new FileOutputStream(s_HtmlPath, false), s_Encode);

out.write(s_HtmlTxt);

out.close();

}catch(Exception e){

return false;

}

return true;

}

/**

* 保存网页中的JS、图片、CSS样式等资源文件

* @param SrcFile 源文件

* @param inputStream 输入流

* @return

private static boolean SaveResourcesFile(File SrcFile, InputStream inputStream) {

if (SrcFile == null || inputStream == null) {

return false;

}

BufferedInputStream in = null;

FileOutputStream fio = null;

BufferedOutputStream osw = null;

try {

in = new BufferedInputStream(inputStream);

fio = new FileOutputStream(SrcFile);

osw = new BufferedOutputStream(new DataOutputStream(fio));

int index = 0;

byte[] a = new byte[1024];

while ((index = in.read(a)) != -1) {

osw.write(a, 0, index);

}

osw.flush();

return true;

} catch (Exception e) {

e.printStackTrace();

return false;

} finally{

try {

if (osw != null)

osw.close();

if (fio != null)

fio.close();

if (in != null)

in.close();

if (inputStream != null)

inputStream.close();

} catch (Exception e) {

e.printStackTrace();

return false;

}

/**

* 获取mht文件里资源文件的URL路径

* @param bp

* @return

private static String getResourcesUrl(MimeBodyPart bp) {

if(bp==null){

return null;

}

try {

Enumeration list = bp.getAllHeaders();

while (list.hasMoreElements()) {

javax.mail.Header head = (javax.mail.Header)list.nextElement();

if (head.getName().compareTo("Content-Location") == 0) {

return head.getValue();

}

return null;

} catch (MessagingException e) {

return null;

}

/**

* 获取mht文件中的内容代码

* @param bp

* @param strEncoding 该mht文件的编码

* @return

private static String getHtmlText(MimeBodyPart bp, String strEncoding) {

InputStream textStream = null;

BufferedInputStream buff = null;

BufferedReader br = null;

Reader r = null;

try {

textStream = bp.getInputStream();

buff = new BufferedInputStream(textStream);

r = new InputStreamReader(buff, strEncoding);

br = new BufferedReader(r);

StringBuffer strHtml = new StringBuffer("");

String strLine = null;

while ((strLine = br.readLine()) != null) {

strHtml.append(strLine + "\r\n");

}

br.close();

r.close();

textStream.close();

return strHtml.toString();

} catch (Exception e) {

e.printStackTrace();

} finally{

try{

if (br != null)

br.close();

if (buff != null)

buff.close();

if (textStream != null)

textStream.close();

}catch(Exception e){

}

return null;

}

/**

* 获取mht网页文件中内容代码的编码

* @param bp

* @return

private static String getEncoding(MimeBodyPart bp) {

if(bp==null){

return null;

}

try {

Enumeration list = bp.getAllHeaders();

while (list.hasMoreElements()) {

javax.mail.Header head = (javax.mail.Header)list.nextElement();

if (head.getName().compareTo("Content-Type") == 0) {

String strType = head.getValue();

int pos = strType.indexOf("charset=");

if (pos>=0) {

String strEncoding = strType.substring(pos + 8, strType.length());

if(strEncoding.startsWith("\"") || strEncoding.startsWith("\'")){

strEncoding = strEncoding.substring(1 , strEncoding.length());

}

if(strEncoding.endsWith("\"") || strEncoding.endsWith("\'")){

strEncoding = strEncoding.substring(0 , strEncoding.length()-1);

}

if (strEncoding.toLowerCase().compareTo("gb2312") == 0) {

strEncoding = "gbk";

}

return strEncoding;

}

} catch (MessagingException e) {

e.printStackTrace();

}

return null;

}

mail.jar (339 KB)

下载次数: 247

分享到：

2011-07-21 09:46

12 楼

XiaoweiGly

2015-01-26

逗比啊！人家是html转mht，你还连接到你这里你是mht转html好吗。。

11 楼

dl96200

2014-01-03

非常感谢楼主分享宝贵的经验技术。按照您那篇 html 转 mht 的博文操作后，生成的 mht 文件显示为乱码，但数据库中的数据显示正常；html 文件是 utf-8 编码，设置时也都用的 utf-8。这是什么原因造成的，应该如何处理呢？

10 楼

xiaoll880214

2013-11-07

删去 getEncoding,getHtmlText方法。可以避免出现乱码，有部分mht文件拿第一个Multipart的时候拿不到charset="utf-8"，因此会导致解析出现乱码，以下为更改后的方法。

9 楼

xiaoll880214

2013-11-07

修改后的mht2html方法：

/**

* mht文件转html文件

* @param mhtFile

* @param htmlFile

public static void mht2html(String mhtFile, String htmlFile)

{

InputStream fis = null;

try

{

/*该段代码同楼主发布部分*/

MimeMessage msg = new MimeMessage(mailSession, fis);

StringBuffer sb = new StringBuffer("");

getMailContent((Part) msg, sb);

// 获取邮件正文文本

String strText = sb.toString();

// 获取消息体

Object content = msg.getContent();

if (content instanceof Multipart)

{

// 带附件的消息

MimeMultipart mp = (MimeMultipart) content;

// 获取消息Body

MimeBodyPart mbp = (MimeBodyPart) mp.getBodyPart(0);

if (StringUtils.isEmpty(strText))

{

return;

}

File parent = null;

if (mp.getCount() > 1)

{

// 获取抽象路径名的绝对路径名，用于保存资源文件

parent = new File(new File(htmlFile).getAbsolutePath()

+ ".files");

parent.mkdirs();

if (!parent.exists())

{

return;

}

for (int i = 1; i < mp.getCount(); i++)

{

mbp = (MimeBodyPart) mp.getBodyPart(i);

// 获取url连接

String strUrl = getResourceUrl(mbp);

if (StringUtils.isEmpty(strUrl))

{

continue;

}

// 存储资源文件

File resources = new File(parent.getAbsolutePath()

+ File.separator + getName(strUrl, i));

if (saveResourcesFile(resources, mbp.getInputStream()))

{

strText = replace(strText, strUrl,

resources.getAbsolutePath());

}

// 保存html文件

saveHtml(strText, htmlFile);

}

catch (Exception e)

{

e.printStackTrace();

}

finally

{

IOUtils.closeQuietly(fis);

}

8 楼

xiaoll880214

2013-11-07

/**

* 解析邮件，将得到的邮件内容保存到一个stringBuffer对象中，解析邮件主要根据MimeType的不同执行不同的操作, 一步一步的解析

* 递归

* @param part

* Part邮件消息

* @throws MessagingException

* @throws IOException

private static String getMailContent(Part msg, StringBuffer bodytext)

throws MessagingException, IOException

{

String contentType = msg.getContentType();

int nameindex = contentType.indexOf("name");

boolean conname = false;

if (nameindex != -1)

{

conname = true;

}

// 纯文本格式的,可以直接解析掉

if (msg.isMimeType("text/plain") && !conname)

{

bodytext.append((String) msg.getContent());

}

// html格式的,可以直接解析掉

else if (msg.isMimeType("text/html") && !conname)

{

bodytext.append((String) msg.getContent());

}

// 附件处理

else if (msg.isMimeType("multipart/*"))

{

Multipart multipart = (Multipart) msg.getContent();

int count = multipart.getCount();

for (int i = 0; i < count; i++)

{

getMailContent(multipart.getBodyPart(i), bodytext);

}

else if (msg.isMimeType("message/rfc822"))

{

getMailContent((Part) msg.getContent(), bodytext);

}

return bodytext.toString();

}

7 楼

xiaoll880214

2013-11-07

终于调好了。改掉了获取所有正文内容的方法，删掉getEncoding,getHtmlText方法。目前没有碰到乱码的问题。依楼主现在的做法，有些mht文件头部没有charset信息，无法获取编码格式，因此获取htmltext的时候解出来的很多都是乱码。mht跟邮件获取正文内容相似，回头贴上获取正文内容的方法：