1. jsoup 解析功能, 把 HTML 分解成最小的node, 类似<p>, <u>
// Normalise the input before parsing: the literal U+00A0 character and the "&nbsp;"
// entity both become ordinary spaces; carriage returns and tabs are removed.
// String.replace is used because every pattern here is a literal, not a regex.
String newHtml = sourceHtml.replace("\u00a0", " ").replace("&nbsp;", " ").replace("\r", "").replace("\t", "");
// Clean the markup against a whitelist that allows only these tags.
Whitelist whitelist = new Whitelist();
whitelist.addTags("strong", "em", "u", "li", "p", "ol", "ul");
String value = Jsoup.clean(newHtml, whitelist);
Document doc = Jsoup.parse(value);
// Direct children of <body>; each one is walked recursively in step 2.
List<Node> nodes = doc.body().childNodes();
2. 运用递归算法 遍历所有node, 获得其文本 和font style list.
package com.util;import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.hssf.usermodel.HSSFFont;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFFont;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.safety.Whitelist;
import com.model.CellApplyFont;
public class ExcelFormatReplace {
/**
* @param wb
* @param sourceHtml
* @return
*/
// public static void main(String []args){
// String html ="<p><span>ab</span><strong><em><span>cdefgHI</span></em>" +
// "</strong><u><em><span>jkl</span></em></u></p><p><u><em><strong>" +
// "<span>sad</span></strong></em></u><em><strong><span>892eiod</span></strong></em><u><em><span>a</span> <span>xc</span> <span>d</span> </em></u></p><ul><li><span>12</span><u><em><span>3</span><strong><span>4</span></strong><span>WA</span></em></u><em><span>1a</span></em><u><em><span>sd</span></em></u></li></ul><ol><li><u><em><span>zca<strong>sd</strong>a</span></em></u></li><li><strong><span>as</span></strong><u><em><span>dzAzxd</span></em></u></li></ol><p><u><em><span>add</span><strong><span>as</span></strong><span>d</span></em></u></p><ul><li><u><em><span>d</span></em><span>asdzxc</span></u></li><li><u><em><span>a</span><strong><span>sdzxc</span></strong></em></u></li></ul>";
//
//
//
// HSSFWorkbook wb = new HSSFWorkbook();
//
//
// HSSFSheet sheet = wb.createSheet("test");
// HSSFRow row = sheet.createRow(0);
//
// HSSFRichTextString string = createRichTextString(wb,html.replace(" ", " "));
//
// HSSFCellStyle cellStyle= wb.createCellStyle();
// cellStyle.setWrapText(true);
// cellStyle.setAlignment(HSSFCellStyle.VERTICAL_TOP);
// HSSFCell cellNew = row.createCell(0);
// cellNew.setCellValue(string);
// cellNew.setCellStyle(cellStyle);
// FileOutputStream fout;
// try {
// fout = new FileOutputStream("C:/project/test.xls");
// wb.write(fout);
// fout.close();
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// }
//this is for 2003
/**
 * Converts an HTML fragment into an Excel 2003 (HSSF) rich text string.
 *
 * <p>The HTML is normalised, cleaned against a whitelist of {@code strong}, {@code em},
 * {@code u}, {@code li}, {@code p}, {@code ol} and {@code ul}, parsed, and every direct
 * child of {@code <body>} is flattened by {@code createFormatString} into plain text plus a
 * list of font runs. The runs are then applied to the resulting rich text string.
 *
 * @param sourceHtml HTML to convert; a {@code null} value raises a NullPointerException
 * @param hssffont fonts to pick from; handed through to {@code getHSSFFont}, which reads
 *        indices 0 to 7
 * @return the rich text string carrying the collected text and font runs
 */
public HSSFRichTextString createRichTextString(String sourceHtml,List<HSSFFont> hssffont){
    // Two spellings of a non-breaking space are replaced: the U+00A0 character and the
    // "&nbsp;" entity. Carriage returns and tabs are dropped. All patterns are literals,
    // so String.replace is used rather than the regex-based replaceAll.
    String newHtml = sourceHtml.replace("\u00a0", " ").replace("&nbsp;", " ").replace("\r", "").replace("\t", "");
    Whitelist whitelist = new Whitelist();
    whitelist.addTags("strong", "em", "u", "li", "p", "ol", "ul");
    Document doc = Jsoup.parse(Jsoup.clean(newHtml, whitelist));

    // Shared traversal state: "startIndex" is the current output length as a decimal String.
    Map<String, Object> indexMap = new HashMap<String, Object>();
    indexMap.put("startIndex", String.valueOf(0));
    List<CellApplyFont> applyFontList = new ArrayList<CellApplyFont>();
    StringBuffer textString = new StringBuffer();

    for (Node node : doc.body().childNodes()) {
        createFormatString(indexMap, applyFontList, node, textString, hssffont);
    }

    HSSFRichTextString richText = new HSSFRichTextString(textString.toString());
    for (CellApplyFont cellFont : applyFontList) {
        richText.applyFont(cellFont.getStartIndex(), cellFont.getEndIndex(), cellFont.getFont());
    }
    return richText;
}
//this is for 2007
/**
 * Converts an HTML fragment into an Excel 2007 (XSSF) rich text string.
 *
 * <p>The HTML is normalised, cleaned against a whitelist of {@code strong}, {@code em},
 * {@code u}, {@code li}, {@code p}, {@code ol} and {@code ul}, parsed, and every direct
 * child of {@code <body>} is flattened by {@code createFormatString2007} into plain text
 * plus a list of font runs. The runs are then applied to the resulting rich text string.
 * The elapsed time in milliseconds is printed to standard output.
 *
 * @param sourceHtml HTML to convert; a {@code null} value raises a NullPointerException
 * @param hssffont fonts to pick from; handed through to {@code getHSSFFont2007}, which
 *        reads indices 0 to 7
 * @return the rich text string carrying the collected text and font runs
 */
public XSSFRichTextString createRichTextString2007(String sourceHtml,List<XSSFFont> hssffont){
    long startMillis = System.currentTimeMillis();

    // Two spellings of a non-breaking space are replaced: the U+00A0 character and the
    // "&nbsp;" entity. Carriage returns and tabs are dropped. All patterns are literals,
    // so String.replace is used rather than the regex-based replaceAll.
    String newHtml = sourceHtml.replace("\u00a0", " ").replace("&nbsp;", " ").replace("\r", "").replace("\t", "");
    Whitelist whitelist = new Whitelist();
    whitelist.addTags("strong", "em", "u", "li", "p", "ol", "ul");
    Document doc = Jsoup.parse(Jsoup.clean(newHtml, whitelist));

    // Shared traversal state: "startIndex" is the current output length as a decimal String.
    Map<String, Object> indexMap = new HashMap<String, Object>();
    indexMap.put("startIndex", String.valueOf(0));
    List<CellApplyFont> applyFontList = new ArrayList<CellApplyFont>();
    StringBuffer textString = new StringBuffer();

    for (Node node : doc.body().childNodes()) {
        createFormatString2007(indexMap, applyFontList, node, textString, hssffont);
    }

    XSSFRichTextString richText = new XSSFRichTextString(textString.toString());
    for (CellApplyFont cellFont : applyFontList) {
        richText.applyFont(cellFont.getStartIndex(), cellFont.getEndIndex(), cellFont.getFont2007());
    }

    long endMillis = System.currentTimeMillis();
    System.out.println("***********************set on cell cost ************"+(endMillis-startMillis));
    return richText;
}
//this is for 2003
/**
 * Recursively flattens a node into plain text plus font runs (Excel 2003 / HSSF).
 *
 * <p>A text node contributes its text to {@code textString} and one {@link CellApplyFont}
 * entry to {@code applyFontList}; any other node is processed by recursing into its
 * children in order.
 *
 * @param startIndexMap shared traversal state. {@code "startIndex"} must hold the current
 *        length of {@code textString} as a decimal String and is advanced here;
 *        {@code "font"} and {@code "nodeTextString"} are scratch entries that are removed
 *        again before a text node returns
 * @param applyFontList receives one font run per text node visited
 * @param node node to process
 * @param textString accumulates the plain text of all visited text nodes
 * @param hssffont fonts handed through to the font lookup
 */
public void createFormatString(Map<String, Object> startIndexMap, List<CellApplyFont> applyFontList, Node node,StringBuffer textString,List<HSSFFont> hssffont){
    int startIndex = Integer.parseInt((String) startIndexMap.get("startIndex"));

    if (!"#text".equals(node.nodeName())) {
        for (Node childNode : node.childNodes()) {
            createFormatString(startIndexMap, applyFontList, childNode, textString, hssffont);
        }
        return;
    }

    // Resolves the font for this text node and, for nested nodes, leaves the text
    // (possibly prefixed with a line break or bullet) under "nodeTextString".
    getApplyFont(node, node, startIndexMap, hssffont);

    Object decoratedText = startIndexMap.get("nodeTextString");
    String nodeText;
    if (decoratedText != null) {
        nodeText = decoratedText.toString();
    } else if (node instanceof TextNode) {
        // Read the decoded text. node.toString() is the node's outer HTML, which would
        // put escaped entities such as "&amp;" or "&lt;" into the cell.
        nodeText = ((TextNode) node).text().trim();
    } else {
        nodeText = node.toString().trim();
    }
    int endIndex = startIndex + nodeText.length();

    CellApplyFont cellfont = new CellApplyFont();
    // When the text starts with the bullet prefix, start the font run two characters
    // later so the line break and the bullet character are left out of it.
    cellfont.setStartIndex(nodeText.startsWith("\n\u2022 ") ? startIndex + 2 : startIndex);
    cellfont.setEndIndex(endIndex);
    cellfont.setFont((HSSFFont) startIndexMap.get("font"));
    applyFontList.add(cellfont);

    // Clear the per-node scratch entries; only the start index carries over.
    startIndexMap.remove("font");
    startIndexMap.remove("nodeTextString");
    startIndexMap.put("startIndex", String.valueOf(endIndex));
    textString.append(nodeText);
}
//this is for 2007
/**
 * Recursively flattens a node into plain text plus font runs (Excel 2007 / XSSF).
 *
 * <p>A text node contributes its text to {@code textString} and one {@link CellApplyFont}
 * entry to {@code applyFontList}; any other node is processed by recursing into its
 * children in order.
 *
 * @param startIndexMap shared traversal state. {@code "startIndex"} must hold the current
 *        length of {@code textString} as a decimal String and is advanced here;
 *        {@code "font"} and {@code "nodeTextString"} are scratch entries that are removed
 *        again before a text node returns
 * @param applyFontList receives one font run per text node visited
 * @param node node to process
 * @param textString accumulates the plain text of all visited text nodes
 * @param hssffont fonts handed through to the font lookup
 */
public void createFormatString2007(Map<String, Object> startIndexMap, List<CellApplyFont> applyFontList, Node node,StringBuffer textString,List<XSSFFont> hssffont){
    int startIndex = Integer.parseInt((String) startIndexMap.get("startIndex"));

    if (!"#text".equals(node.nodeName())) {
        for (Node childNode : node.childNodes()) {
            createFormatString2007(startIndexMap, applyFontList, childNode, textString, hssffont);
        }
        return;
    }

    // Resolves the font for this text node and, for nested nodes, leaves the text
    // (possibly prefixed with a line break or an indented bullet) under "nodeTextString".
    getApplyFont2007(node, node, startIndexMap, hssffont);

    Object decoratedText = startIndexMap.get("nodeTextString");
    String nodeText;
    if (decoratedText != null) {
        nodeText = decoratedText.toString();
    } else if (node instanceof TextNode) {
        // Read the decoded text. node.toString() is the node's outer HTML, which would
        // put escaped entities such as "&amp;" or "&lt;" into the cell.
        nodeText = ((TextNode) node).text().trim();
    } else {
        nodeText = node.toString().trim();
    }
    int endIndex = startIndex + nodeText.length();

    CellApplyFont cellfont = new CellApplyFont();
    // Leave the line break, the indentation and the bullet character out of the font run.
    // The 2007 bullet prefix has spaces between "\n" and the bullet, so a fixed test for
    // "\n\u2022 " would never match here.
    cellfont.setStartIndex(startIndex + bulletPrefixLength2007(nodeText));
    cellfont.setEndIndex(endIndex);
    cellfont.setFont2007((XSSFFont) startIndexMap.get("font"));
    applyFontList.add(cellfont);

    // Clear the per-node scratch entries; only the start index carries over.
    startIndexMap.remove("font");
    startIndexMap.remove("nodeTextString");
    startIndexMap.put("startIndex", String.valueOf(endIndex));
    textString.append(nodeText);
}

/**
 * Measures a leading bullet prefix of the form line break, optional spaces, bullet, space.
 *
 * @param text text to inspect
 * @return the number of characters up to and including the bullet character, or 0 when
 *         {@code text} does not start with such a prefix
 */
private static int bulletPrefixLength2007(String text) {
    if (text.length() == 0 || text.charAt(0) != '\n') {
        return 0;
    }
    int pos = 1;
    while (pos < text.length() && Character.isSpaceChar(text.charAt(pos))) {
        pos++;
    }
    boolean bulletFollows = pos + 1 < text.length()
            && text.charAt(pos) == '\u2022'
            && text.charAt(pos + 1) == ' ';
    return bulletFollows ? pos + 1 : 0;
}
//this is for 2003
/**
 * Climbs from a text node towards {@code <body>}, collecting its style tags and any
 * line-break or bullet prefix on the way (Excel 2003 / HSSF).
 *
 * <p>While climbing, {@code "font"} in the map holds a StringBuffer of concatenated tag
 * names ({@code em}, {@code strong}, {@code u}) and {@code "nodeTextString"} holds the
 * text with its prefixes. When the parent is {@code <body>}, {@code "font"} is replaced by
 * the font selected through {@code getHSSFFont}. A text node that is a direct child of
 * {@code <body>} therefore never gets a {@code "nodeTextString"} entry.
 *
 * @param node node whose parent is examined on this step; the text node on the first call
 * @param textNode the text node the traversal started from
 * @param indexStringMap shared scratch map, see above
 * @param hssffont fonts handed to {@code getHSSFFont}
 */
private void getApplyFont(Node node,Node textNode,Map<String, Object> indexStringMap,List<HSSFFont> hssffont){
    Node parent = node.parent();
    String parentName = parent.nodeName();

    if ("body".equals(parentName)) {
        // Top reached: swap the accumulated tag names for the matching font.
        StringBuffer tagNames = (StringBuffer) indexStringMap.get("font");
        indexStringMap.put("font", getHSSFFont(tagNames, hssffont));
        return;
    }

    StringBuffer nodeTextString = new StringBuffer();
    Object pendingText = indexStringMap.get("nodeTextString");
    if (pendingText != null) {
        nodeTextString.append(pendingText);
    } else if (node instanceof TextNode) {
        // Use the decoded text. outerHtml() would yield escaped entities such as
        // "&amp;" or "&lt;", which would then show up verbatim in the cell.
        nodeTextString.append(((TextNode) node).text().trim());
    } else {
        nodeTextString.append(node.outerHtml().trim());
    }

    Object pendingTags = indexStringMap.get("font");
    StringBuffer font = pendingTags != null ? (StringBuffer) pendingTags : new StringBuffer();

    // NOTE(review): "first text in parent" is decided by a text-prefix comparison, so a
    // later text node whose text is a prefix of the parent's text also matches.
    if ("p".equals(parentName)) {
        // Every paragraph except the very first child of <body> starts on a new line.
        // Identity comparison: only that exact node is exempt, not one with equal content.
        if (node.ownerDocument().body().childNode(0) != parent
                && getNodeText(parent, new StringBuffer()).startsWith(getNodeText(textNode, new StringBuffer()))) {
            nodeTextString.insert(0, "\n");
        }
    } else if ("em".equals(parentName)) {
        font.append("em");
    } else if ("strong".equals(parentName)) {
        font.append("strong");
    } else if ("u".equals(parentName)) {
        font.append("u");
    } else if ("li".equals(parentName)) {
        // The first text of a list item starts a new line with a bullet.
        if (getNodeText(parent, new StringBuffer()).startsWith(getNodeText(textNode, new StringBuffer()))) {
            nodeTextString.insert(0, "\n\u2022 ");
        }
    }

    indexStringMap.put("nodeTextString", nodeTextString);
    indexStringMap.put("font", font);
    getApplyFont(parent, textNode, indexStringMap, hssffont);
}
//this is for 2007
/**
 * Climbs from a text node towards {@code <body>}, collecting its style tags and any
 * line-break or bullet prefix on the way (Excel 2007 / XSSF).
 *
 * <p>While climbing, {@code "font"} in the map holds a StringBuffer of concatenated tag
 * names ({@code em}, {@code strong}, {@code u}) and {@code "nodeTextString"} holds the
 * text with its prefixes. When the parent is {@code <body>}, {@code "font"} is replaced by
 * the font selected through {@code getHSSFFont2007}. A text node that is a direct child of
 * {@code <body>} therefore never gets a {@code "nodeTextString"} entry.
 *
 * @param node node whose parent is examined on this step; the text node on the first call
 * @param textNode the text node the traversal started from
 * @param indexStringMap shared scratch map, see above
 * @param hssffont fonts handed to {@code getHSSFFont2007}
 */
private void getApplyFont2007(Node node,Node textNode,Map<String, Object> indexStringMap,List<XSSFFont> hssffont){
    Node parent = node.parent();
    String parentName = parent.nodeName();

    if ("body".equals(parentName)) {
        // Top reached: swap the accumulated tag names for the matching font.
        StringBuffer tagNames = (StringBuffer) indexStringMap.get("font");
        indexStringMap.put("font", getHSSFFont2007(tagNames, hssffont));
        return;
    }

    StringBuffer nodeTextString = new StringBuffer();
    Object pendingText = indexStringMap.get("nodeTextString");
    if (pendingText != null) {
        nodeTextString.append(pendingText);
    } else if (node instanceof TextNode) {
        // Use the decoded text. outerHtml() would yield escaped entities such as
        // "&amp;" or "&lt;", which would then show up verbatim in the cell.
        nodeTextString.append(((TextNode) node).text().trim());
    } else {
        nodeTextString.append(node.outerHtml().trim());
    }

    Object pendingTags = indexStringMap.get("font");
    StringBuffer font = pendingTags != null ? (StringBuffer) pendingTags : new StringBuffer();

    // NOTE(review): "first text in parent" is decided by a text-prefix comparison, so a
    // later text node whose text is a prefix of the parent's text also matches.
    if ("p".equals(parentName)) {
        // Every paragraph except the very first child of <body> starts on a new line.
        // Identity comparison: only that exact node is exempt, not one with equal content.
        if (node.ownerDocument().body().childNode(0) != parent
                && getNodeText(parent, new StringBuffer()).startsWith(getNodeText(textNode, new StringBuffer()))) {
            nodeTextString.insert(0, "\n");
        }
    } else if ("em".equals(parentName)) {
        font.append("em");
    } else if ("strong".equals(parentName)) {
        font.append("strong");
    } else if ("u".equals(parentName)) {
        font.append("u");
    } else if ("li".equals(parentName)) {
        // The first text of a list item starts a new line with an indented bullet;
        // the indent is (level + 1) spaces, level coming from getSubBulletLevel.
        if (getNodeText(parent, new StringBuffer()).startsWith(getNodeText(textNode, new StringBuffer()))) {
            int indexLevel = getSubBulletLevel(node, 0);
            StringBuffer space = new StringBuffer();
            for (int i = 0; i <= indexLevel; i++) {
                space.append(" ");
            }
            nodeTextString.insert(0, "\n" + space.toString() + "\u2022 ");
        }
    }

    indexStringMap.put("nodeTextString", nodeTextString);
    indexStringMap.put("font", font);
    getApplyFont2007(parent, textNode, indexStringMap, hssffont);
}
//get sub bullet level
/**
 * Counts the {@code li} nodes on the path from {@code node} upwards, stopping at (and not
 * counting) the node whose parent is {@code <body>}.
 *
 * @param node node to start from
 * @param level count to start with
 * @return {@code level} plus the number of {@code li} nodes passed on the way up
 */
public int getSubBulletLevel(Node node, int level){
    int depth = level;
    Node current = node;
    // Climb until the current node sits directly under <body>.
    while (!current.parent().nodeName().equals("body")) {
        if (current.nodeName().equals("li")) {
            depth++;
        }
        current = current.parent();
    }
    return depth;
}
/**
 * Appends the text of every text node at or below {@code node}, in child order, to
 * {@code text}.
 *
 * @param node node to read
 * @param text buffer that receives the text; it is modified in place
 * @return the content of {@code text} after appending
 */
public String getNodeText(Node node,StringBuffer text){
    if (!(node instanceof TextNode)) {
        // Not a text node: gather the text of each child in turn.
        for (Node child : node.childNodes()) {
            getNodeText(child, text);
        }
        return text.toString();
    }
    TextNode leaf = (TextNode) node;
    text.append(leaf.text());
    return text.toString();
}
/* public static HSSFFont getHSSFFont(HSSFFont original,List<HSSFFont> hssffont){
HSSFFont hssffontNew= hssffont.get(0);
if(true == original.getItalic()){
hssffontNew = hssffont.get(1);
if(Font.BOLDWEIGHT_BOLD == original.getBoldweight()){
hssffontNew = hssffont.get(4);
if(Font.U_SINGLE == original.getUnderline()){
hssffontNew = hssffont.get(7);
}
}
if(Font.U_SINGLE == original.getUnderline()){
hssffontNew = hssffont.get(5);
}
}
else if(Font.BOLDWEIGHT_BOLD == original.getBoldweight()){
hssffontNew = hssffont.get(2);
if(Font.U_SINGLE == original.getUnderline()){
hssffontNew = hssffont.get(6);
}
}
else if(Font.U_SINGLE == original.getUnderline()){
hssffontNew = hssffont.get(3);
}
return hssffontNew;
}*/
//this is for 2003
/**
 * Picks the font matching the style tags named in {@code original} (Excel 2003 / HSSF).
 *
 * <p>List positions: 0 none, 1 em, 2 strong, 3 u, 4 em+strong, 5 em+u, 6 strong+u,
 * 7 em+strong+u.
 *
 * @param original concatenated tag names; {@code null} is treated as no tags
 * @param hssffont fonts to choose from, laid out as listed above
 * @return the font at the position matching the tags
 */
public HSSFFont getHSSFFont(StringBuffer original,List<HSSFFont> hssffont){
    String tags = (original == null) ? "" : original.toString();
    boolean italic = tags.contains("em");
    boolean bold = tags.contains("strong");
    boolean underline = tags.contains("u");

    int index;
    if (italic) {
        if (bold) {
            index = underline ? 7 : 4;
        } else {
            index = underline ? 5 : 1;
        }
    } else if (bold) {
        index = underline ? 6 : 2;
    } else {
        index = underline ? 3 : 0;
    }
    return hssffont.get(index);
}
//this is for 2007
/**
 * Picks the font matching the style tags named in {@code original} (Excel 2007 / XSSF).
 *
 * <p>List positions: 0 none, 1 em, 2 strong, 3 u, 4 em+strong, 5 em+u, 6 strong+u,
 * 7 em+strong+u.
 *
 * @param original concatenated tag names; {@code null} is treated as no tags
 * @param hssffont fonts to choose from, laid out as listed above
 * @return the font at the position matching the tags
 */
public XSSFFont getHSSFFont2007(StringBuffer original,List<XSSFFont> hssffont){
    String tags = (original == null) ? "" : original.toString();
    boolean italic = tags.contains("em");
    boolean bold = tags.contains("strong");
    boolean underline = tags.contains("u");

    int index;
    if (italic) {
        if (bold) {
            index = underline ? 7 : 4;
        } else {
            index = underline ? 5 : 1;
        }
    } else if (bold) {
        index = underline ? 6 : 2;
    } else {
        index = underline ? 3 : 0;
    }
    return hssffont.get(index);
}
/* public List<HSSFFont> getHSSFWorkbook(HSSFWorkbook wb){
HSSFFont font= wb.createFont();
HSSFFont font1= wb.createFont();
HSSFFont font2= wb.createFont();
HSSFFont font3= wb.createFont();
HSSFFont font4= wb.createFont();
HSSFFont font5= wb.createFont();
HSSFFont font6= wb.createFont();
HSSFFont font7= wb.createFont();
List<HSSFFont> hssffontList = new ArrayList<HSSFFont>();
hssffontList.add(font);
font1.setItalic(true);
hssffontList.add(font1);
font2.setBoldweight(Font.BOLDWEIGHT_BOLD);
hssffontList.add(font2);
font3.setUnderline(Font.U_SINGLE);
hssffontList.add(font3);
font4.setItalic(true);
font4.setBoldweight(Font.BOLDWEIGHT_BOLD);
hssffontList.add(font4);
font5.setItalic(true);
font5.setUnderline(Font.U_SINGLE);
hssffontList.add(font5);
font6.setBoldweight(Font.BOLDWEIGHT_BOLD);
font6.setUnderline(Font.U_SINGLE);
hssffontList.add(font6);
font7.setBoldweight(Font.BOLDWEIGHT_BOLD);
font7.setUnderline(Font.U_SINGLE);
font7.setItalic(true);
hssffontList.add(font7);
return hssffontList;
}*/
}
3. set richtext to excel cell value.