简单的html解析工具类

最近在做一个项目,需要解析 html 报告里的数据,而各份报告的格式又不尽相同,于是写了一个简单的工具类。

实现上结合使用了 jsoup 和 sipsoup 两个包。


maven地址:

<!-- jsoup Xpath  解析html -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
<dependency>
    <groupId>com.virjar</groupId>
    <artifactId>sipsoup</artifactId>
    <version>RELEASE</version>
</dependency>

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.virjar.sipsoup.exception.XpathSyntaxErrorException;
import com.virjar.sipsoup.parse.XpathParser;

/**
 * Utility for extracting rows of data from HTML vulnerability reports.
 *
 * <p>The HTML file is parsed with jsoup and queried with sipsoup XPath expressions. The caller
 * supplies a map of rules: one entry holds the XPath of the "root" elements to iterate over, and
 * every other entry maps an output field name to the rule used to fill that field for each root
 * element.
 *
 * <p>Original header: ParseHtmlUtil.java, package com.bluedon.track.util, 2017-07-12, V1.0.
 */
public class ParseHtmlUtil {

	private static final Logger log = Logger.getLogger(ParseHtmlUtil.class);

	/** Rule prefix: evaluate the rest of the rule against the root element's previous sibling. */
	private static final String PREVIOUS_SIBLING = "previousElementSibling";

	/** Rule prefix: evaluate the rest of the rule, then take the next sibling of the match. */
	private static final String NEXT_SIBLING = "nextElementSibling";

	/** A rule that is empty or only lowercase letters is copied to the output as a literal. */
	private static final Pattern LITERAL_RULE = Pattern.compile("^[a-z]*$");

	private static final Pattern PREVIOUS_SIBLING_RULE = Pattern.compile("^previousElementSibling.*$");

	private static final Pattern NEXT_SIBLING_RULE = Pattern.compile("^nextElementSibling.*$");

	/** Matches one HTML tag (non-greedy, does not span line breaks). */
	private static final Pattern HTML_TAG = Pattern.compile("<.+?>");

	/**
	 * Parses an HTML report and extracts one or more rows per root element.
	 *
	 * <p>Each rule value is interpreted as follows:
	 * <ul>
	 * <li>empty, or lowercase letters only: stored unchanged as a literal value;</li>
	 * <li>starts with {@code previousElementSibling}: handled by {@link #changeForGree};</li>
	 * <li>starts with {@code nextElementSibling}: handled by {@link #changeForGree2};</li>
	 * <li>anything else: evaluated as an XPath relative to the root element; the inner HTML of
	 * the match, with tags stripped, becomes the value.</li>
	 * </ul>
	 * When an XPath rule matches more than one element, one row is emitted per matched element
	 * (each row carrying the single-valued fields plus that element's value).
	 *
	 * <p>The supplied map is not modified.
	 *
	 * @param file
	 *            the HTML file to parse (read as UTF-8)
	 * @param rootName
	 *            key in {@code map} whose value is the XPath selecting the root elements
	 * @param map
	 *            key -> field name, value -> extraction rule (including the root entry)
	 * @return the extracted rows; empty when the root XPath is syntactically invalid or matches
	 *         nothing
	 * @throws IOException
	 *             if the file cannot be read
	 * @throws IllegalArgumentException
	 *             if {@code map} is null or has no value for {@code rootName}
	 */
	public static List<Map<String, Object>> paserHtml(File file, String rootName, Map<String, Object> map) throws IOException {
		if (map == null || map.get(rootName) == null) {
			throw new IllegalArgumentException("no root XPath rule found under key '" + rootName + "'");
		}
		List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
		Document doc = Jsoup.parse(file, "UTF-8");
		String rootPath = map.get(rootName).toString();
		try {
			List<Element> roots = XpathParser.compile(rootPath).evaluateToElement(doc);
			if (roots != null) {
				for (Element node : roots) {
					collectRows(node, rootName, map, list);
				}
			}
		} catch (XpathSyntaxErrorException e) {
			log.error("parse error !", e);
		}
		return list;
	}

	/** Applies every non-root rule to one root element and appends the resulting row(s) to {@code out}. */
	private static void collectRows(Element node, String rootName, Map<String, Object> rules,
			List<Map<String, Object>> out) {
		Map<String, Object> row = new HashMap<String, Object>();
		// Fields whose XPath matched several elements (e.g. multiple IP entries); expanded below.
		Map<String, List<Element>> multiValued = new HashMap<String, List<Element>>();

		for (Map.Entry<String, Object> rule : rules.entrySet()) {
			String key = rule.getKey();
			if (key == null ? rootName == null : key.equals(rootName)) {
				// The root rule only selects the elements to iterate; it is not an output field.
				continue;
			}
			String path = rule.getValue() == null ? "" : rule.getValue().toString();
			if (LITERAL_RULE.matcher(path).matches()) {
				row.put(key, path);
			} else if (PREVIOUS_SIBLING_RULE.matcher(path).matches()) {
				// Special handling for NSFOCUS reports: value lives in the preceding sibling.
				changeForGree(row, key, node, path);
			} else if (NEXT_SIBLING_RULE.matcher(path).matches()) {
				changeForGree2(row, key, node, path);
			} else {
				List<Element> matches = getNodes(node, path);
				if (matches.size() > 1) {
					multiValued.put(key, matches);
				} else {
					row.put(key, matches.isEmpty() ? "" : replaceHtml(matches.get(0).html()));
				}
			}
		}

		if (multiValued.isEmpty()) {
			out.add(row);
			return;
		}
		for (Map.Entry<String, List<Element>> field : multiValued.entrySet()) {
			for (Element match : field.getValue()) {
				// Copy per element: sharing one map would leave every row with the last value.
				Map<String, Object> expanded = new HashMap<String, Object>(row);
				expanded.put(field.getKey(), replaceHtml(match.html()));
				out.add(expanded);
			}
		}
	}

	/**
	 * Fills {@code key} from the previous sibling of {@code node}.
	 *
	 * <p>The {@code previousElementSibling} marker is removed from {@code path} and the remainder
	 * is evaluated as an XPath against the previous sibling; the inner HTML of the first match is
	 * stored (tags are not stripped). An empty string is stored when there is no previous sibling
	 * or nothing matches.
	 *
	 * @param map
	 *            row being built; modified in place
	 * @param key
	 *            field name to store the value under
	 * @param node
	 *            current root element
	 * @param path
	 *            rule containing the {@code previousElementSibling} marker
	 * @return the same {@code map} instance
	 */
	public static Map<String, Object> changeForGree(Map<String, Object> map,String key, Element node,String path) {
		Element previous = node.previousElementSibling();
		String value = "";
		if (previous != null) {
			value = getNode(previous, path.replace(PREVIOUS_SIBLING, "")).html();
		}
		map.put(key, value);
		return map;
	}

	/**
	 * Fills {@code key} from the element following the one matched by the rule.
	 *
	 * <p>The {@code nextElementSibling} marker is removed from {@code path}, the remainder is
	 * evaluated as an XPath against {@code node}, and the inner HTML of the first match's next
	 * sibling is stored (tags are not stripped). An empty string is stored when there is no such
	 * sibling.
	 *
	 * @param map
	 *            row being built; modified in place
	 * @param key
	 *            field name to store the value under
	 * @param node
	 *            current root element
	 * @param path
	 *            rule containing the {@code nextElementSibling} marker
	 * @return the same {@code map} instance
	 */
	public static Map<String, Object> changeForGree2(Map<String, Object> map,String key, Element node,String path) {
		Element anchor = getNode(node, path.replace(NEXT_SIBLING, ""));
		Element next = anchor.nextElementSibling();
		map.put(key, next == null ? "" : next.html());
		return map;
	}

	/**
	 * Removes HTML tags from a string.
	 *
	 * @param html
	 *            text possibly containing tags; may be null
	 * @return the text with every {@code <...>} run removed, or an empty string for null/empty
	 *         input
	 */
	public static String replaceHtml(String html) {
		if (html == null || html.isEmpty()) {
			return "";
		}
		return HTML_TAG.matcher(html).replaceAll("");
	}

	/**
	 * Tests whether the whole of {@code value} matches {@code regex}.
	 *
	 * @param value
	 *            text to test; null never matches
	 * @param regex
	 *            regular expression
	 * @return true if the entire value matches
	 */
	public static boolean checkRegex(String value,String regex){
		return value != null && Pattern.matches(regex, value);
	}

	/**
	 * Returns the first element matched by {@code xpath} relative to {@code node}.
	 *
	 * @param node
	 *            element to evaluate against
	 * @param xpath
	 *            XPath expression
	 * @return the first match, or a detached placeholder element named {@code "null"} when
	 *         nothing matches or the expression is invalid (never null)
	 */
	public static Element getNode(Element node,String xpath){
		List<Element> matches = getNodes(node, xpath);
		return matches.isEmpty() ? new Element("null") : matches.get(0);
	}

	/**
	 * Returns all elements matched by {@code xpath} relative to {@code node}.
	 *
	 * @param node
	 *            element to evaluate against
	 * @param xpath
	 *            XPath expression
	 * @return the matches; an empty list when the expression is invalid (the error is logged)
	 */
	public static List<Element> getNodes(Element node,String xpath){
		try {
			List<Element> found = XpathParser.compile(xpath).evaluateToElement(node);
			if (found != null) {
				return found;
			}
		} catch (XpathSyntaxErrorException e) {
			log.error("invalid xpath: " + xpath, e);
		}
		return new ArrayList<Element>();
	}

	/**
	 * Demo entry point: parses a report and prints the extracted rows and their count.
	 *
	 * @param args
	 *            optional; {@code args[0]} overrides the default report path
	 */
	public static void main(String[] args) throws Exception {
		File file = new File(args.length > 0 ? args[0] : "F:/OneKeyDownLoads/index.html");

		Map<String, Object> map = new HashMap<String, Object>();
		/* Root rule: selects the elements that are iterated over. */
		map.put("rootpath",
				"//table[@id='vulDataTable']/tbody/tr[@class='even vh_ip']|//tr[@class='even vm_ip']|//tr[@class='even vl_ip']");
		// Further examples of field rules:
		// map.put("desc", "//td//table/tbody/tr[2]/td[2]");
		// map.put("solution","//td//table/tbody/tr[3]/td[2]");
		// map.put("cve","//td//table/tbody/tr[7]/td[2]/a");
		// map.put("cnnvd","//td//table/tbody/tr/td[text()*='CNNVD-']");
		// map.put("cnvd","//td//table/tbody/tr/td[text()*='CNVD-']");
		// map.put("type","host");
		// map.put("name", "previousElementSibling//td[1]/a");
		// map.put("host", "//td//table/tbody/tr[1]/td[2]/a");
		map.put("risk", "nextElementSibling//td//table/tbody/tr/td[text()*='威胁分值']");

		List<Map<String, Object>> list = paserHtml(file, "rootpath", map);
		System.out.println(list);
		System.out.println(list.size());
	}
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值