使用 Java 将网页保存为 MHT 格式

http://blog.csdn.net/dongle2001/archive/2008/06/17/2557434.aspx#826147

 

 http://thinkgem.iteye.com/blog/724208


package org.macau.test;

import java.io.BufferedInputStream;  
import java.io.BufferedOutputStream;  
import java.io.BufferedReader;  
import java.io.ByteArrayInputStream;  
import java.io.ByteArrayOutputStream;  
import java.io.DataInputStream;  
import java.io.DataOutputStream;  
import java.io.File;  
import java.io.FileInputStream;  
import java.io.FileOutputStream;  
import java.io.FileWriter;  
import java.io.IOException;  
import java.io.InputStream;  
import java.io.InputStreamReader;  
import java.io.OutputStream;  
import java.io.Reader;  
import java.io.UnsupportedEncodingException;  
import java.net.HttpURLConnection;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.net.URLEncoder;  
import java.util.*;  
 
import org.htmlparser.Parser;  
import org.htmlparser.Tag;  
import org.htmlparser.filters.TagNameFilter;  
import org.htmlparser.lexer.Lexer;  
import org.htmlparser.lexer.Page;  
import org.htmlparser.util.DefaultParserFeedback;  
import org.htmlparser.util.NodeList;  
import org.htmlparser.util.ParserException;  
 
import javax.activation.DataHandler;  
import javax.activation.DataSource;  
import javax.activation.MimetypesFileTypeMap;  
import javax.mail.Message;  
import javax.mail.MessagingException;  
import javax.mail.Multipart;  
import javax.mail.Session;  
import javax.mail.internet.InternetAddress;  
import javax.mail.internet.MimeBodyPart;  
import javax.mail.internet.MimeMessage;  
import javax.mail.internet.MimeMultipart;  
import javax.mail.internet.MimePartDataSource;  
 
/** 
 * 
 * MHT文件解析类 
 * 
 */ 
@SuppressWarnings("unchecked")  
public class Html2MHTCompiler {  
    /** URL of the page; relative resource links are resolved against it. */
    private URL strWeb = null;
    /** HTML text of the page. */
    private String strText = null;
    /** Path of the local MHT file that is written. */
    private String strFileName = null;
    /** Character encoding of the page. */
    private String strEncoding = null;

    // Additional message headers used for the MHT (MIME message) envelope.
    private String from = "thinkgem@gmail.com";
    private String to;
    private String subject;
    private String cc;
    private String bcc;
    private String smtp = "localhost";
 
    public static void main(String[] args) {  
        String strUrl = "http://edition.cnn.com/2010/WORLD/europe/08/20/berlusconi.ft/index.html?eref=edition";  
        String strEncoding = "utf-8";  
        String strText1 = JQuery.getHtmlText(strUrl, strEncoding);  
        //String strText2 = "<img src=\"http://www.imathas.com/cgi-bin/mimetex.cgi?sqrt{2}\"/><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://192.168.1.2:8080/ibc/theme/default/style.css\" /><P><div class=paper_list>sdfsdf<div class=paper>dfkjsldjfl<table><tr><td>abc</td><td>abc</td></tr><tr><td>abc</td><td>abc</td></tr></table></div></div><IMG SRC=\"http://192.168.1.13/cc.jpg\"/><SPAN>sdfsdf</SPAN></P><p><span style=\"font-size: 10pt; color:#f00;\"><font face=\"宋体\">在下列各溶液中,离子一定能大量共存的是<span lang=\"EN-US\">                                (    )<o:p></o:p></span></font></span></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">A</span><span style=\"font-size: 10pt\">.强碱性溶液中:<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">S<sup>2-</sup></span>、<span lang=\"EN-US\">ClO</span><sup>-</sup>、<span lang=\"EN-US\">SO<sub>4</sub><sup>2</sup>< /span><sup>-</sup><span lang=\"EN-US\"> <o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">B</span><span style=\"font-size: 10pt\">.含有<span lang=\"EN-US\">0.1mol</span></span></font><span style=\"font-family: "MS Mincho"; font-size: 10pt; mso-bidi-font-family: 'MS Mincho'\">?</span><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">L</span><sup><span style=\"font-size: 10pt\">-<span lang=\"EN-US\">1 </span></span></sup><span lang=\"EN-US\" style=\"font-size: 10pt\">Fe<sup>3</sup></span><sup><span style=\"font-size: 10pt\">+</span></sup><span style=\"font-size: 10pt\">的溶液中:<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">Mg<sup>2</sup></span><sup>+& lt;/sup>、<span lang=\"EN-US\">I</span><sup>-</sup>、<span lang=\"EN-US\">NO<sub>3</sub></span><sup>-& lt;/sup><span lang=\"EN-US\"><o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">C</span><span style=\"font-size: 10pt\">.无色溶液中:<span 
lang=\"EN-US\">Na</span><sup>+</sup>、<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">CO<sub>3</sub><sup>2</sup>< /span><sup>-</sup>、<span lang=\"EN-US\">Cu<sup>2+</sup><o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">D</span><span style=\"font-size: 10pt\">.室温下,<span lang=\"EN-US\">pH</span>=<span lang=\"EN-US\">1</span>的溶液中:<span lang=\"EN-US\">Na</span><sup>+</sup>、<span lang=\"EN-US\">Fe<sup>3</sup></span><sup>+& lt;/sup>、<span lang=\"EN-US\">NO<sub>3</sub></span><sup>-& lt;/sup>、<span lang=\"EN-US\">SO<sub>4</sub><sup>2</sup>< /span><sup>-</sup><span lang=\"EN-US\"> <o:p></o:p></span></span></font></p>` ( sqrt{2} )/(2) `<p> </p><script type=\"text/javascript\" src=\"http://192.168.1.2:8080/ibc/manage/js/ASCIIMathMLwFallback2.js\"></script>";  
        Html2MHTCompiler h2t = new Html2MHTCompiler(strText1, strUrl, strEncoding, "d:\\test.mht");  
        h2t.compile();  
        //Html2MHTCompiler.mht2html("c:\\test.mht", "c:\\test.htm");  
    }  
 
    /** 
     * 
     * 
     * 方法说明:初始化 
     * 
     * 
     * 输入参数:strText 网页文本内容; strUrl 网页地址; strEncoding 网页编码; strFileName 本地文件名 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    public Html2MHTCompiler(String strText, String strUrl, String strEncoding,  
            String strFileName) {  
        try {  
            strWeb = new URL(strUrl);  
        } catch (MalformedURLException e) {  
 
            e.printStackTrace();  
            return;  
        }  
        this.strText = strText;  
        this.strEncoding = strEncoding;  
        this.strFileName = strFileName;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:执行下载操作 
     * 
     * 
     * 输入参数: 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    public boolean compile() {  
        if (strWeb == null || strText == null || strFileName == null 
                || strEncoding == null)  
            return false;  
        HashMap urlMap = new HashMap();  
        NodeList nodes = new NodeList();  
        try {  
            Parser parser = createParser(strText);  
            parser.setEncoding(strEncoding);  
            nodes = parser.parse(null);  
        } catch (ParserException e) {  
            e.printStackTrace();  
        }  
        extractAllScriptNodes(nodes);  
        ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);  
        ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);  
        for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {  
            Map.Entry entry = (Map.Entry) iter.next();  
            String key = (String) entry.getKey();  
            String val = (String) entry.getValue();  
            strText = JHtmlClear.replace(strText, val, key);  
        }  
        try {  
            createMhtArchive(strText, urlScriptList, urlImageList);  
        } catch (Exception e) {  
 
            e.printStackTrace();  
            return false;  
        }  
        return true;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:建立HTML parser 
     * 
     * 
     * 输入参数:inputHTML 网页文本内容 
     * 
     * 
     * 返回类型:HTML parser 
     * 
     */ 
    private Parser createParser(String inputHTML) {  
 
        Lexer mLexer = new Lexer(new Page(inputHTML));  
        return new Parser(mLexer, new DefaultParserFeedback(  
                DefaultParserFeedback.QUIET));  
    }  
 
    /** 
     * 
     * 
     * 方法说明:抽取基础URL地址 
     * 
     * 
     * 输入参数:nodes 网页标签集合 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    private void extractAllScriptNodes(NodeList nodes) {  
        NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(  
                "BASE"), true);  
        if (filtered != null && filtered.size() > 0) {  
            Tag tag = (Tag) filtered.elementAt(0);  
            String href = tag.getAttribute("href");  
            if (href != null && href.length() > 0) {  
                try {  
                    strWeb = new URL(href);  
                } catch (MalformedURLException e) {  
 
                    e.printStackTrace();  
                }  
            }  
        }  
    }  
 
    /** 
     * 
     * 
     * 方法说明:抽取网页包含的css,js链接 
     * 
     * 
     * 输入参数:nodes 网页标签集合; urlMap 已存在的url集合 
     * 
     * 
     * 返回类型:css,js链接的集合 
     * 
     */ 
    private ArrayList extractAllScriptNodes(NodeList nodes, HashMap urlMap) {  
        ArrayList urlList = new ArrayList();  
        NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(  
                "script"), true);  
        for (int i = 0; i < filtered.size(); i++) {  
            Tag tag = (Tag) filtered.elementAt(i);  
            String src = tag.getAttribute("src");  
            // Handle external css file's url  
            if (src != null && src.length() > 0) {  
                String innerURL = src;  
                String absoluteURL = makeAbsoluteURL(strWeb, innerURL);  
                if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {  
                    urlMap.put(absoluteURL, innerURL);  
                    ArrayList urlInfo = new ArrayList();  
                    urlInfo.add(innerURL);  
                    urlInfo.add(absoluteURL);  
                    urlList.add(urlInfo);  
                }  
                tag.setAttribute("src", absoluteURL);  
            }  
        }  
 
        filtered = nodes.extractAllNodesThatMatch(new TagNameFilter("link"),  
                true);  
        for (int i = 0; i < filtered.size(); i++) {  
            Tag tag = (Tag) filtered.elementAt(i);  
            String type = (tag.getAttribute("type"));  
            String rel = (tag.getAttribute("rel"));  
            String href = tag.getAttribute("href");  
 
            boolean isCssFile = false;  
            if (rel != null) {  
                isCssFile = rel.indexOf("stylesheet") != -1;  
            } else if (type != null) {  
                isCssFile |= type.indexOf("text/css") != -1;  
            }  
            // Handle external css file's url  
            if (isCssFile && href != null && href.length() > 0) {  
                String innerURL = href;  
                String absoluteURL = makeAbsoluteURL(strWeb, innerURL);  
                if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {  
                    urlMap.put(absoluteURL, innerURL);  
                    ArrayList urlInfo = new ArrayList();  
                    urlInfo.add(innerURL);  
                    urlInfo.add(absoluteURL);  
                    urlList.add(urlInfo);  
                }  
                tag.setAttribute("href", absoluteURL);  
            }  
        }  
 
        return urlList;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:抽取网页包含的图像链接 
     * 
     * 
     * 输入参数:nodes 网页标签集合; urlMap 已存在的url集合 
     * 
     * 
     * 返回类型:图像链接集合 
     * 
     */ 
    private ArrayList extractAllImageNodes(NodeList nodes, HashMap urlMap) {  
        ArrayList urlList = new ArrayList();  
        NodeList filtered = nodes.extractAllNodesThatMatch(new TagNameFilter(  
                "IMG"), true);  
        for (int i = 0; i < filtered.size(); i++) {  
            Tag tag = (Tag) filtered.elementAt(i);  
            String src = tag.getAttribute("src");  
            // Handle external css file's url  
            if (src != null && src.length() > 0) {  
                String innerURL = src;  
                String absoluteURL = makeAbsoluteURL(strWeb, innerURL);  
                if (absoluteURL != null && !urlMap.containsKey(absoluteURL)) {  
                    urlMap.put(absoluteURL, innerURL);  
                    ArrayList urlInfo = new ArrayList();  
                    urlInfo.add(innerURL);  
                    urlInfo.add(absoluteURL);  
                    urlList.add(urlInfo);  
                }  
                tag.setAttribute("src", absoluteURL);  
            }  
        }  
 
        return urlList;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:相对路径转绝对路径 
     * 
     * 
     * 输入参数:strWeb 网页地址; innerURL 相对路径链接 
     * 
     * 
     * 返回类型:绝对路径链接 
     * 
     */ 
    public static String makeAbsoluteURL(URL strWeb, String innerURL) {  
 
        // 去除后缀  
        int pos = innerURL.indexOf("?");  
        if (pos != -1) {  
            innerURL = innerURL.substring(0, pos);  
        }  
        if (innerURL != null && innerURL.toLowerCase().indexOf("http") == 0) {  
            System.out.println(innerURL);  
            return innerURL;  
        }  
 
        URL linkUri = null;  
        try {  
            linkUri = new URL(strWeb, innerURL);  
        } catch (MalformedURLException e) {  
 
            e.printStackTrace();  
            return null;  
        }  
 
        String absURL = linkUri.toString();  
        absURL = JHtmlClear.replace(absURL, "../", "");  
        absURL = JHtmlClear.replace(absURL, "./", "");  
        System.out.println(absURL);  
        return absURL;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:创建mht文件 
     * 
     * 
     * 输入参数:content 网页文本内容; urlScriptList 脚本链接集合; urlImageList 图片链接集合 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    private void createMhtArchive(String content, ArrayList urlScriptList,  
            ArrayList urlImageList) throws Exception {  
        MimeMultipart mp = new MimeMultipart("related");  
        Properties props = new Properties();  
        props.put("mail.smtp.host", smtp);  
        Session session = Session.getDefaultInstance(props, null);  
        MimeMessage msg = new MimeMessage(session);  
        msg.setHeader("X-Mailer", "Code Manager .SWT");  
        if (from != null) {  
            msg.setFrom(new InternetAddress(from));  
        }  
        if (subject != null) {  
            msg.setSubject(subject);  
        }  
        if (to != null) {  
            InternetAddress[] toAddresses = getInetAddresses(to);  
            msg.setRecipients(Message.RecipientType.TO, toAddresses);  
        }  
        if (cc != null) {  
            InternetAddress[] ccAddresses = getInetAddresses(cc);  
            msg.setRecipients(Message.RecipientType.CC, ccAddresses);  
        }  
        if (bcc != null) {  
            InternetAddress[] bccAddresses = getInetAddresses(bcc);  
            msg.setRecipients(Message.RecipientType.BCC, bccAddresses);  
        }  
        // 设置网页正文  
        MimeBodyPart bp = new MimeBodyPart();  
        bp.setText(content, strEncoding);  
        bp.addHeader("Content-Type", "text/html;charset=" + strEncoding);  
        bp.addHeader("Content-Location", strWeb.toString());  
        mp.addBodyPart(bp);  
        int urlCount = urlScriptList.size();  
        for (int i = 0; i < urlCount; i++) {  
            bp = new MimeBodyPart();  
            ArrayList urlInfo = (ArrayList) urlScriptList.get(i);  
            // String url = urlInfo.get(0).toString();  
            String absoluteURL = urlInfo.get(1).toString();  
            bp.addHeader("Content-Location", javax.mail.internet.MimeUtility  
                    .encodeWord(java.net.URLDecoder.decode(absoluteURL,  
                            strEncoding)));  
            DataSource source = new AttachmentDataSource(absoluteURL, "text");  
            bp.setDataHandler(new DataHandler(source));  
            mp.addBodyPart(bp);  
        }  
 
        urlCount = urlImageList.size();  
        for (int i = 0; i < urlCount; i++) {  
            bp = new MimeBodyPart();  
            ArrayList urlInfo = (ArrayList) urlImageList.get(i);  
            // String url = urlInfo.get(0).toString();  
            String absoluteURL = urlInfo.get(0).toString();  
            System.out.println(urlInfo.get(0).toString() + " +++ " + urlInfo.get(1));  
            bp.addHeader("Content-Location", javax.mail.internet.MimeUtility  
                    .encodeWord(java.net.URLDecoder.decode(absoluteURL,  
                            strEncoding)));  
            DataSource source = new AttachmentDataSource(absoluteURL, "image");  
            bp.setDataHandler(new DataHandler(source));  
            mp.addBodyPart(bp);  
        }  
        msg.setContent(mp);  
        // write the mime multi part message to a file  
        msg.writeTo(new FileOutputStream(strFileName));  
    }  
 
    /** 
     * 
     * 
     * 方法说明:mht转html 
     * 
     * 
     * 输入参数:strMht mht文件路径; strHtml html文件路径 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    public static void mht2html(String strMht, String strHtml) {  
        try {  
 
            InputStream fis = new FileInputStream(strMht);  
            Session mailSession = Session.getDefaultInstance(System  
                    .getProperties(), null);  
            MimeMessage msg = new MimeMessage(mailSession, fis);  
            Object content = msg.getContent();  
            if (content instanceof Multipart) {  
                MimeMultipart mp = (MimeMultipart) content;  
                MimeBodyPart bp1 = (MimeBodyPart) mp.getBodyPart(0);  
                String strEncodng = getEncoding(bp1);  
                String strText = getHtmlText(bp1, strEncodng);  
                if (strText == null)  
                    return;  
                File parent = null;  
                if (mp.getCount() > 1) {  
                    parent = new File(new File(strHtml).getAbsolutePath() + ".files");  
                    parent.mkdirs();  
                    if (!parent.exists())  
                        return;  
                }  
                for (int i = 1; i < mp.getCount(); ++i) {  
                    MimeBodyPart bp = (MimeBodyPart) mp.getBodyPart(i);  
                    String strUrl = getResourcesUrl(bp);  
                    if (strUrl == null)  
                        continue;  
                    DataHandler dataHandler = bp.getDataHandler();  
                    MimePartDataSource source = (MimePartDataSource) dataHandler  
                            .getDataSource();  
                    File resources = new File(parent.getAbsolutePath()  
                            + File.separator + getName(strUrl, i));  
                    if (saveResourcesFile(resources, bp.getInputStream()))  
                        strText = JHtmlClear.replace(strText, strUrl, resources  
                                .getAbsolutePath());  
                }  
                saveHtml(strText, strHtml);  
            }  
        } catch (Exception e) {  
 
            e.printStackTrace();  
        }  
    }  
 
    /** 
     * 
     * 
     * 方法说明:得到资源文件的name 
     * 
     * 
     * 输入参数:strName 资源文件链接, ID 资源文件的序号 
     * 
     * 
     * 返回类型:资源文件的本地临时文件名 
     * 
     */ 
    public static String getName(String strName, int ID) {  
        char separator = '/';  
        System.out.println(strName);  
        System.out.println(separator);  
        if (strName.lastIndexOf(separator) >= 0)  
            return format(strName.substring(strName.lastIndexOf(separator) + 1));  
        return "temp" + ID;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:得到网页编码 
     * 
     * 
     * 输入参数:bp MimeBodyPart类型的网页内容 
     * 
     * 
     * 返回类型:MimeBodyPart里的网页内容的编码 
     * 
     */ 
    private static String getEncoding(MimeBodyPart bp) {  
        if (bp != null) {  
            try {  
                Enumeration list = bp.getAllHeaders();  
                while (list.hasMoreElements()) {  
                    javax.mail.Header head = (javax.mail.Header) list  
                            .nextElement();  
                    if (head.getName().compareTo("Content-Type") == 0) {  
                        String strType = head.getValue();  
                        int pos = strType.indexOf("charset=");  
                        if (pos != -1) {  
                            String strEncoding = strType.substring(pos + 8, strType.length());  
                            if (strEncoding.toLowerCase().compareTo("gb2312") == 0) {  
                                strEncoding = "gbk";  
                            }  
                            return strEncoding;  
                        }  
                    }  
                }  
            } catch (MessagingException e) {  
                e.printStackTrace();  
            }  
        }  
        return null;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:得到资源文件url 
     * 
     * 
     * 输入参数:bp MimeBodyPart类型的网页内容 
     * 
     * 
     * 返回类型:资源文件url 
     * 
     */ 
    private static String getResourcesUrl(MimeBodyPart bp) {  
        if (bp != null) {  
            try {  
                Enumeration list = bp.getAllHeaders();  
                while (list.hasMoreElements()) {  
                    javax.mail.Header head = (javax.mail.Header) list  
                            .nextElement();  
                    if (head.getName().compareTo("Content-Location") == 0) {  
                        return head.getValue();  
                    }  
                }  
            } catch (MessagingException e) {  
 
                e.printStackTrace();  
            }  
 
        }  
        return null;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:格式化文件名 
     * 
     * 
     * 输入参数:strName 文件名 
     * 
     * 
     * 返回类型:经过处理的符合命名规则的文件名 
     * 
     */ 
    private static String format(String strName) {  
        if (strName == null)  
            return null;  
        strName = strName.replaceAll("     ", " ");  
        String strText = "\\/:*?\"<>|^___FCKpd___0quot;";  
        for (int i = 0; i < strName.length(); ++i) {  
            String ch = String.valueOf(strName.charAt(i));  
            if (strText.indexOf(ch) != -1) {  
                strName = strName.replace(strName.charAt(i), '-');  
            }  
        }  
        return strName;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:保存资源文件 
     * 
     * 
     * 输入参数:resources 要创建的资源文件; inputStream 要输入文件中的流 
     * 
     * 
     * 返回类型:boolean 
     * 
     */ 
    private static boolean saveResourcesFile(File resources,  
            InputStream inputStream) {  
        if (resources == null || inputStream == null) {  
            return false;  
        }  
        BufferedInputStream in = null;  
        FileOutputStream fio = null;  
        BufferedOutputStream osw = null;  
        try {  
            in = new BufferedInputStream(inputStream);  
            fio = new FileOutputStream(resources);  
            osw = new BufferedOutputStream(new DataOutputStream(fio));  
            int b;  
            byte[] a = new byte[1024];  
            boolean isEmpty = true;  
            while ((b = in.read(a)) != -1) {  
                isEmpty = false;  
                osw.write(a, 0, b);  
                osw.flush();  
            }  
            osw.close();  
            fio.close();  
            in.close();  
            inputStream.close();  
            if (isEmpty)  
                resources.delete();  
            return true;  
        } catch (Exception e) {  
 
            e.printStackTrace();  
            System.out.println("解析mht失败");  
            return false;  
        } finally {  
            try {  
                if (osw != null)  
                    osw.close();  
                if (fio != null)  
                    fio.close();  
                if (in != null)  
                    in.close();  
                if (inputStream != null)  
                    inputStream.close();  
            } catch (Exception e) {  
                e.printStackTrace();  
                System.out.println("解析mht失败");  
                return false;  
            }  
        }  
    }  
 
    /** 
     * 
     * 
     * 方法说明:得到mht文件的标题 
     * 
     * 
     * 输入参数:mhtFilename mht文件名 
     * 
     * 
     * 返回类型:mht文件的标题 
     * 
     */ 
    public static String getTitle(String mhtFilename) {  
        try {  
 
            InputStream fis = new FileInputStream(mhtFilename);  
            Session mailSession = Session.getDefaultInstance(System  
                    .getProperties(), null);  
            MimeMessage msg = new MimeMessage(mailSession, fis);  
            Object content = msg.getContent();  
            if (content instanceof Multipart) {  
                MimeMultipart mp = (MimeMultipart) content;  
                MimeBodyPart bp1 = (MimeBodyPart) mp.getBodyPart(0);  
                String strEncodng = getEncoding(bp1);  
                String strText = getHtmlText(bp1, strEncodng);  
                if (strText == null)  
                    return null;  
                strText = strText.toLowerCase();  
                int pos1 = strText.indexOf("<title>");  
                int pos2 = strText.indexOf("</title>");  
                if (pos1 != -1 && pos2 != -1 && pos2 > pos1) {  
                    return strText.substring(pos1 + 7, pos2).trim();  
                }  
            }  
            return null;  
        } catch (Exception e) {  
 
            e.printStackTrace();  
            return null;  
        }  
    }  
 
    /** 
     * 
     * 
     * 方法说明:得到html文本 
     * 
     * 
     * 输入参数:bp MimeBodyPart类型的网页内容; strEncoding 内容编码 
     * 
     * 
     * 返回类型:html文本 
     * 
     */ 
    private static String getHtmlText(MimeBodyPart bp, String strEncoding) {  
        InputStream textStream = null;  
        BufferedInputStream buff = null;  
        BufferedReader br = null;  
        Reader r = null;  
        try {  
            textStream = bp.getInputStream();  
            buff = new BufferedInputStream(textStream);  
            r = new InputStreamReader(buff, strEncoding);  
            br = new BufferedReader(r);  
            StringBuffer strHtml = new StringBuffer("");  
            String strLine = null;  
            while ((strLine = br.readLine()) != null) {  
                strHtml.append(strLine + "\r\n");  
            }  
            br.close();  
            r.close();  
            textStream.close();  
            return strHtml.toString();  
        } catch (Exception e) {  
 
            e.printStackTrace();  
        } finally {  
            try {  
                if (br != null)  
                    br.close();  
                if (buff != null)  
                    buff.close();  
                if (textStream != null)  
                    textStream.close();  
            } catch (Exception e) {  
                System.out.println("解析mht失败");  
            }  
        }  
        return null;  
    }  
 
    /** 
     * 
     * 
     * 方法说明:保存html文件 
     * 
     * 
     * 输入参数:strText html内容; strHtml html文件名 
     * 
     * 
     * 返回类型: 
     * 
     */ 
    private static void saveHtml(String strText, String strHtml) {  
        try {  
            FileWriter fw = new FileWriter(strHtml);  
            fw.write(strText);  
            fw.close();  
        } catch (IOException e) {  
            e.printStackTrace();  
            System.out.println("解析mht失败");  
        }  
    }  
 
    private InternetAddress[] getInetAddresses(String emails) throws Exception {  
        ArrayList list = new ArrayList();  
        StringTokenizer tok = new StringTokenizer(emails, ",");  
        while (tok.hasMoreTokens()) {  
            list.add(tok.nextToken());  
        }  
        int count = list.size();  
        InternetAddress[] addresses = new InternetAddress[count];  
        for (int i = 0; i < count; i++) {  
            addresses[i] = new InternetAddress(list.get(i).toString());  
        }  
        return addresses;  
    }  
 
    class AttachmentDataSource implements DataSource {  
        private MimetypesFileTypeMap map = new MimetypesFileTypeMap();  
        private String strUrl;  
        private String strType;  
        private byte[] dataSize = null;  
 
        /** 
         * 
         * This is some content type maps. 
         * 
         */ 
        private Map normalMap = new HashMap();  
        {  
            // Initiate normal mime type map  
            // Images  
            normalMap.put("image", "image/jpeg");  
            normalMap.put("text", "text/plain");  
        }  
 
        public AttachmentDataSource(String strUrl, String strType) {  
            this.strType = strType;  
            this.strUrl = strUrl;  
 
            strUrl = strUrl.trim();  
            strUrl = strUrl.replaceAll(" ", "%20");  
            dataSize = JQuery.downBinaryFile(strUrl);  
        }  
 
        /** 
         * 
         * Returns the content type. 
         * 
         */ 
        public String getContentType() {  
            return getMimeType(getName());  
        }  
 
        public String getName() {  
            char separator = File.separatorChar;  
            if (strUrl.lastIndexOf(separator) >= 0)  
                return strUrl.substring(strUrl.lastIndexOf(separator) + 1);  
            return strUrl;  
        }  
 
        private String getMimeType(String fileName) {  
            String type = (String) normalMap.get(strType);  
            if (type == null) {  
                try {  
                    type = map.getContentType(fileName);  
                } catch (Exception e) {  
 
                }  
                System.out.println(type);  
                // Fix the null exception  
                if (type == null) {  
                    type = "application/octet-stream";  
                }  
            }  
 
            return type;  
        }  
 
        public InputStream getInputStream() throws IOException {  
 
            if (dataSize == null)  
                dataSize = new byte[0];  
            return new ByteArrayInputStream(dataSize);  
        }  
 
        public OutputStream getOutputStream() throws IOException {  
 
            return new java.io.ByteArrayOutputStream();  
        }  
 
    }  
}  
 
/** String helper used when rewriting links in page text. */
class JHtmlClear {
    /**
     * Replaces every literal occurrence of {@code s1} in {@code s} with {@code s2}.
     *
     * <p>The text is returned unchanged when there is nothing sensible to
     * replace: {@code s} is {@code null}, the search string is {@code null} or
     * empty (an empty search string would otherwise insert {@code s2} between
     * every character), or the replacement is {@code null}.
     *
     * @param s  text to search; may be {@code null}
     * @param s1 literal text to look for
     * @param s2 literal replacement
     * @return the text with all occurrences replaced
     */
    public static String replace(String s, String s1, String s2) {
        if (s == null || s1 == null || s1.length() == 0 || s2 == null) {
            return s;
        }
        return s.replace(s1, s2);
    }
}
 
/** Minimal HTTP download helpers. */
class JQuery {
    /**
     * Downloads a page and decodes it with the given charset.
     *
     * @param strUrl      address of the page
     * @param strEncoding charset used to decode the response body
     * @return the page text, or {@code ""} when the download or decoding fails
     *         or the response is empty
     */
    public static String getHtmlText(String strUrl, String strEncoding) {
        try {
            byte[] body = fetch(strUrl);
            return body == null ? "" : new String(body, strEncoding);
        } catch (Exception e) {
            // Best effort by contract: callers receive an empty page on failure.
            return "";
        }
    }

    /**
     * Downloads a resource as raw bytes.
     *
     * @param s address of the resource
     * @return the response body, or {@code null} when the download fails or the
     *         response is empty
     */
    public static byte[] downBinaryFile(String s) {
        try {
            return fetch(s);
        } catch (Exception e) {
            // Best effort by contract: callers treat null as "no data".
            return null;
        }
    }

    /** Opens an HTTP connection, reads the whole body and releases the connection. */
    private static byte[] fetch(String address) throws Exception {
        HttpURLConnection connection =
                (HttpURLConnection) new URL(address).openConnection();
        InputStream in = null;
        try {
            in = connection.getInputStream();
            return getBytes(in);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // the body has already been read (or reading failed)
                }
            }
            connection.disconnect();
        }
    }

    /**
     * Reads a stream to its end. The stream is not closed.
     *
     * @param is stream to read
     * @return all bytes of the stream, or {@code null} when it delivers none
     * @throws Exception when reading fails
     */
    public static byte[] getBytes(InputStream is) throws Exception {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        int read;
        while ((read = is.read(buffer)) != -1) {
            collected.write(buffer, 0, read);
        }
        return collected.size() > 0 ? collected.toByteArray() : null;
    }
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值