这周接了一个很坑的需求：平时解析的邮件都是标准 eml 格式，这次第一次要解析 html 格式的邮件。网上没找到现成的解决办法，只好自己写了一个按标签匹配内容的解析类，代码如下：
package com.email_monitor.until;
import java.io.*;
import java.text.ParseException;
import java.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class PraseMimeMessage {
/**
* 读取文件内容
*
* @param fileName String 如 c:\1.txt 绝对路径
* @return boolean
*/
public static String readFile(String fileName, String charset) {
int index = charset.indexOf(“charset=”);
String fileContent = “”;
try {
File f = new File(fileName);
if (f.isFile() && f.exists()) {
InputStreamReader read = new InputStreamReader(new FileInputStream(f), charset.substring(index + 8).replaceAll(" ", ""));
BufferedReader reader = new BufferedReader(read);
String line;
while ((line = reader.readLine()) != null) {
fileContent += line;
}
read.close();
}
} catch (Exception e) {
e.printStackTrace();
}
return fileContent;
}
/**
* @param html 解析qq邮件html
* @return
*/
public static Map<Object, Object> readQQEmailHtml(String html) {
Map<Object, Object> map = new HashMap<>();
Document doc = Jsoup.parseBodyFragment(html);
//发件人
String Efrom = "";
for (Element element : doc.select("div[class=tcolor qm_left txtflow]").select("b[class=grn]")) {
Efrom = element.text();
break;
}
Elements elements = doc.select("div[class=clear]").select("div[class=qm_left txtflow graytext]");
for (Element element : elements) {
Efrom = Efrom + "<" + element.text() + ">";
map.put("fjrAddress", Efrom);
break;
}
//收件人,抄送人,是否有附件
Elements elements1 = doc.select("td[class=settingtable txt_left]").select("div[class=addrtitle nowrap]");
for (Element element : elements1) {
if (element.text().contains("收件人")) {
element = element.nextElementSibling();
if (map.containsKey("sjrAddressList") && map.containsKey("sjrAddressList")) {
map.put("sjrAddressList", map.get("sjrAddressList") + element.text());
} else {
map.put("sjrAddressList", element.text());
}
} else if (element.text().contains("抄 送")) {
element = element.nextElementSibling();
if (map.containsKey("ccMailAddr") && map.containsKey("ccMailAddr")) {
map.put("ccMailAddr", map.get("ccMailAddr") + element.text());
} else {
map.put("ccMailAddr", element.text());
}
} else if (element.text().contains("附 件")) {
map.put("existFile", true);
}
}
//发送时间
Elements elements2 = doc.select("td[class=settingtable txt_left]").select("span[class=addrtitle]");
try {
for (Element element : elements2) {
if (element.text().contains("时 间")) {
element = element.nextElementSibling();
map.put("sendTime",PraseMimeMessage.switchDate(element.text()));
}
}
} catch (Exception e) {
e.printStackTrace();
}
//正文
Elements elements4 = doc.select("div[id=contentDiv]");
StringBuilder content = new StringBuilder();
for (Element element : elements4) {
content.append(element.text() + " ");
//解析正文中的图片
Elements element11 = element.getElementsByTag("img");
for(Element element2 : element11) {
String imgSrc=element2.attr("src"); //获取src属性的值
content.append(imgSrc + " ");
}
}
map.put("content",content.toString());
Elements elements3 = doc.select("div[class=qm_left]").select("span[class=sub_title]");
for (Element element : elements3) {
map.put("subject",element.text());
}
return map;
}
public static String switchDate(String date) throws ParseException {
String date3 = date.substring(0, date.indexOf("(")) + date.substring(date.indexOf(")") + 1);
if (date3.contains("上午")) {
//10:26
StringBuffer str = new StringBuffer();
for (int i = 0; i < date3.length(); i++) {
if (date3.charAt(i) != '上' && date3.charAt(i) != '午') {
str.append(date3.charAt(i));
}
}
return str.toString();
} else if(date3.contains("下午")){
Integer time = Integer.parseInt(PraseMimeMessage.subjectStr(date3, "下午").substring(0, PraseMimeMessage.subjectStr(date3, "下午").indexOf(":"))) + 12;
Integer time1 = Integer.parseInt(PraseMimeMessage.subjectStr(date3, "下午").substring(PraseMimeMessage.subjectStr(date3, "下午").indexOf(":") + 1));
date.substring(0, date.indexOf("(")).substring(0, date.indexOf("("));
String datee = date.substring(0, date.indexOf("(")) + " " + time.toString() + ":" + time1.toString();
return datee;
} else if(date3.contains("中午")){
//10:26
StringBuffer str = new StringBuffer();
for (int i = 0; i < date3.length(); i++) {
if (date3.charAt(i) != '中' && date3.charAt(i) != '午') {
str.append(date3.charAt(i));
}
}
return str.toString();
}else if(date3.contains("晚上")){
Integer time = Integer.parseInt(PraseMimeMessage.subjectStr(date3, "晚上").substring(0, PraseMimeMessage.subjectStr(date3, "晚上").indexOf(":"))) + 12;
Integer time1 = Integer.parseInt(PraseMimeMessage.subjectStr(date3, "晚上").substring(PraseMimeMessage.subjectStr(date3, "晚上").indexOf(":") + 1));
date.substring(0, date.indexOf("(")).substring(0, date.indexOf("("));
String datee = date.substring(0, date.indexOf("(")) + " " + time.toString() + ":" + time1.toString();
return datee;
}
return null;
}
public static String subjectStr(String str, String subStr) {
return str.substring(str.indexOf(subStr) + subStr.length());
}
}