xpath抓取页面内容

直接上代码


import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Static helpers that download a page with Jsoup, tidy its body markup with
 * HtmlCleaner, and query the resulting W3C DOM with an XPath expression.
 *
 * <p>Note that only the HTML of the page's {@code <body>} is fetched and
 * cleaned, so XPath expressions should be written against that markup rather
 * than against what a browser shows.
 */
public class JsoupHelper {

    /**
     * Fetches {@code url} and evaluates {@code xpath} against the cleaned body markup.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression; it is evaluated as a node-set
     * @return the evaluation result (a {@link NodeList} for node-set expressions),
     *         or {@code null} when the page could not be downloaded because of an
     *         {@link IOException}
     * @throws Exception if the DOM cannot be built or the XPath expression is
     *                   invalid or does not yield a node-set
     */
    public static Object fecthNode(String url, String xpath) throws Exception {
        String html;
        try {
            Connection connect = Jsoup.connect(url);
            html = connect.get().body().html();
        } catch (IOException e) {
            // Best effort: a failed download is reported and signalled with null,
            // which the callers in this class turn into an empty collection.
            e.printStackTrace();
            return null;
        }
        // Debugging tip: when an XPath unexpectedly matches nothing, print `html`
        // here and write the expression against the markup actually received.
        HtmlCleaner hc = new HtmlCleaner();
        TagNode tn = hc.clean(html);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(tn);
        XPath xPath = XPathFactory.newInstance().newXPath();

        return xPath.evaluate(xpath, dom, XPathConstants.NODESET);
    }

    /**
     * Builds one {@code GoodsTemp} per node matched by {@code xpath}.
     *
     * <p>The node's text becomes the name, its {@code href} attribute becomes the
     * detail URL, and the id is the {@code href} with every {@code "/products/"}
     * occurrence removed. A node without an {@code href} gets empty strings for
     * the URL and the id instead of causing a {@link NullPointerException}.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression selecting the link nodes
     * @return the goods in document order; empty when nothing matched or the
     *         download failed
     * @throws Exception see {@link #fecthNode(String, String)}
     */
    public static List<GoodsTemp> fecth2Object(String url, String xpath) throws Exception {
        List<GoodsTemp> list = new ArrayList<GoodsTemp>();

        for (Node node : matchedNodes(url, xpath)) {
            String detail = attributeValue(node, "href");
            GoodsTemp goods = new GoodsTemp();
            goods.setName(node.getTextContent());
            goods.setDetailUrl(detail);
            goods.setId(detail.replace("/products/", ""));
            list.add(goods);
        }

        return list;
    }

    /**
     * Maps the text of every matched node to its {@code href} attribute value.
     *
     * <p>Nodes without an {@code href} map to the empty string. Nodes that share
     * the same text overwrite each other; the last one wins while the key keeps
     * its first insertion position.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression selecting the link nodes
     * @return an insertion-ordered map; empty when nothing matched or the
     *         download failed
     * @throws Exception see {@link #fecthNode(String, String)}
     */
    public static Map<String, String> fecthByMap(String url, String xpath) throws Exception {
        Map<String, String> nodeMap = new LinkedHashMap<>();

        for (Node node : matchedNodes(url, xpath)) {
            nodeMap.put(node.getTextContent(), attributeValue(node, "href"));
        }

        return nodeMap;
    }

    /**
     * Collects the value of one attribute from every node matched by {@code xpath}.
     *
     * <p>A node that lacks the attribute contributes the empty string, so the
     * returned list stays aligned with the matched nodes.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression selecting the nodes
     * @param attr  name of the attribute to read
     * @return the attribute values in document order; empty when nothing matched
     *         or the download failed
     * @throws Exception see {@link #fecthNode(String, String)}
     */
    public static List<String> fecthAttr(String url, String xpath, String attr) throws Exception {
        List<String> list = new ArrayList<>();

        for (Node node : matchedNodes(url, xpath)) {
            list.add(attributeValue(node, attr));
        }

        return list;
    }

    /** Evaluates the XPath on the page and returns the non-null matched nodes in order. */
    private static List<Node> matchedNodes(String url, String xpath) throws Exception {
        List<Node> nodes = new ArrayList<>();

        Object result = fecthNode(url, xpath);
        if (result instanceof NodeList) {
            NodeList nodeList = (NodeList) result;
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                if (node != null) {
                    nodes.add(node);
                }
            }
        }

        return nodes;
    }

    /** Returns the named attribute's value, or "" when the node has no such attribute. */
    private static String attributeValue(Node node, String name) {
        // getAttributes() is null for anything that is not an element (text, comment, ...).
        if (node.getAttributes() == null) {
            return "";
        }
        Node attribute = node.getAttributes().getNamedItem(name);
        return attribute == null ? "" : attribute.getTextContent();
    }

    /**
     * Demo entry point: prints the goods found on the page given as the first argument.
     *
     * @param args {@code args[0]} is the URL of the listing page to scrape
     * @throws Exception see {@link #fecth2Object(String, String)}
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            // There is nothing sensible to fetch without a page address.
            System.err.println("Usage: java JsoupHelper <url>");
            return;
        }
        String xpath = "//div[@class='gp-list-view-search']/ul/li//div[@class='info']/div[@class='title']/a[1]";
        List<GoodsTemp> list = fecth2Object(args[0], xpath);
        for (GoodsTemp goodsTemp : list) {
            System.err.println(goodsTemp);
        }
    }
}

maven配置如下:

    <dependency> 
      <groupId>org.slf4j</groupId> 
      <artifactId>slf4j-jdk14</artifactId> 
      <version>1.4.3</version> 
    </dependency>
  <dependency>    
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<dependency>  
    <groupId>javax.xml</groupId>  
    <artifactId>jaxp-api</artifactId>  
    <version>1.4.2</version>  
</dependency>  
<dependency>  
    <groupId>net.sourceforge.htmlcleaner</groupId>  
    <artifactId>htmlcleaner</artifactId>  
    <version>2.9</version>  
</dependency> 

有时会出现获取不到内容的问题,多数情况下是xpath写得不对:在浏览器中看到的html与程序实际拿到的html可能并不完全一致(例如部分内容由JavaScript动态生成,而且本例只取了body部分并经过HtmlCleaner整理)。建议先把程序返回的html打印出来,再据此编写xpath。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值