最近在做一个项目,需要解析 HTML 报告里的数据,而各份报告的格式都不一样,于是写了一个简单的工具类。
实现上结合使用了 jsoup 和 sipsoup 这两个包。
Maven 依赖:
<!-- jsoup: HTML parser used to load the report into a Document -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- sipsoup: provides the XpathParser used to evaluate XPath rules against jsoup elements -->
<!-- NOTE(review): RELEASE is a floating meta-version, so builds are not reproducible;
     pin an explicit sipsoup version once the tested one is confirmed. -->
<dependency>
<groupId>com.virjar</groupId>
<artifactId>sipsoup</artifactId>
<version>RELEASE</version>
</dependency>
import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.virjar.sipsoup.exception.XpathSyntaxErrorException; import com.virjar.sipsoup.parse.XpathParser; /** * @Title: ParseHtmlUtil.java * @Package com.bluedon.track.util * @Description: 解析html漏洞数据工具类 * @author * @date 2017年7月12日 下午3:32:56 * @version V1.0 */ public class ParseHtmlUtil { private final static Logger log = Logger.getLogger(ParseHtmlUtil.class); /** * @Title: paserHtml * @Description: 解析html漏洞数据 * @param filePath * 解析的html地址 * @param rootName * 解析的根节点名称 * @param map * key->名称,value->解析地址规则 * @return List<Map<String,Object>> * @throws IOException */ public static List<Map<String, Object>> paserHtml(File file, String rootName, Map<String, Object> map) throws IOException { List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(); Document doc = Jsoup.parse(file, "UTF-8"); String rootPath = map.get(rootName).toString(); map.remove(rootName); try { List<Element> eles=XpathParser.compile(rootPath).evaluateToElement(doc); if(null!=eles&&eles.size()>0){ for (Element node : eles) { if (null != map || map.size() >0) { Set<String> keySet = map.keySet(); Map<String,Object> maps=new HashMap<String,Object>(); /*此情况是ips有多条数据的*/ boolean manyIps=false; Set<String> manyNames=new HashSet<String>(); for (String key : keySet) { String path=map.get(key).toString(); /*过滤 路径为空的,仅仅是字母的 */ if(path.equals("")||checkRegex(path,"^[a-z]*$")){ maps.put(key,map.get(key).toString()); }else if(checkRegex(path, "^previousElementSibling.*$")) {/*绿盟的特殊处理 同级的前一个节点 */ maps = changeForGree(maps,key, node,path); }else if(checkRegex(path, "^nextElementSibling.*$")){ maps = changeForGree2(maps,key, 
node,path); }else{ if(getNodes(node,map.get(key).toString()).size()>1){ manyIps=true; manyNames.add(key); }else{ maps.put(key,replaceHtml(getNode(node,map.get(key).toString()).html())); } } } /*此情况是ips有多条数据的*/ if(manyIps){ for (String key : manyNames) { List<Element> nodelist=getNodes(node,map.get(key).toString()); for(Element n : nodelist){ maps.put(key,replaceHtml(n.html())); list.add(maps); } } }else{ list.add(maps); } } } } } catch (XpathSyntaxErrorException e) { log.info("parse error !"); e.printStackTrace(); } return list; } public static Map changeForGree(Map<String, Object> map,String key, Element node,String path) { Element node1 = node.previousElementSibling(); map.put(key, getNode(node1, path.replace("previousElementSibling", "")).html().toString()); return map; } public static Map changeForGree2(Map<String, Object> map,String key, Element node,String path) { Element node1 = getNode(node, path.replace("nextElementSibling", "")); map.put(key,node1.nextElementSibling().html() ); return map; } /** * 替换掉HTML标签方法 */ public static String replaceHtml(String html) { if ("".equals(html)){ return ""; } String regEx = "<.+?>"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(html); String s = m.replaceAll(""); return s; } public static boolean checkRegex(String value,String regex){ Pattern p=Pattern.compile(regex); Matcher matcher = p.matcher(value); boolean rs = matcher.matches(); return rs; } public static Element getNode(Element node,String xpath){ Element ele=new Element("null"); List<Element> list; try { list = XpathParser.compile(xpath).evaluateToElement(node); for (Element jxNode: list) { return jxNode; } } catch (XpathSyntaxErrorException e) { e.printStackTrace(); } return ele; } public static List<Element> getNodes(Element node,String xpath){ List<Element> jxNodeList=new ArrayList<Element>(); try { jxNodeList = XpathParser.compile(xpath).evaluateToElement(node); } catch (com.virjar.sipsoup.exception.XpathSyntaxErrorException e) { 
e.printStackTrace(); } return jxNodeList; }
public static void main(String[] args) throws Exception { // File file = new File("F:/OneKeyDownLoads/index.html"); Document doc = Jsoup.parse(file, "UTF-8"); Map<String, Object> map = new HashMap<String, Object>();/*根节点 用来循环的*/ map.put("rootpath", "//table[@id='vulDataTable']/tbody/tr[@class='even vh_ip']|//tr[@class='even vm_ip']|//tr[@class='even vl_ip']"); // map.put("desc", "//td//table/tbody/tr[2]/td[2]"); // map.put("solution","//td//table/tbody/tr[3]/td[2]"); // map.put("cve","//td//table/tbody/tr[7]/td[2]/a"); // map.put("cnnvd","//td//table/tbody/tr/td[text()*='CNNVD-']"); // map.put("cnvd","//td//table/tbody/tr/td[text()*='CNVD-']"); // map.put("type","host"); // map.put("name", "previousElementSibling//td[1]/a"); map.put("risk", "nextElementSibling//td//table/tbody/tr/td[text()*='威胁分值']"); // map.put("host", "//td//table/tbody/tr[1]/td[2]/a"); List<Map<String, Object>> list = paserHtml(file, "rootpath", map); System.out.println(JSON.toJSON(list)); System.out.println(list.size()); //System.out.println(checkRegex("previousElementSibling//td[1]/a", "^previousElementSibling.*$")); }}