htmlcleaner+xpath抓取网页数据

本文介绍了如何利用HtmlCleaner库配合XPath表达式来定位并提取网页中的数据。通过下载HtmlCleaner的jar包,并编写代码,可以有效地获取所需信息。实践过程简单明了,适用于网页数据抓取任务。
摘要由CSDN通过智能技术生成

先下载 HtmlCleaner 的 jar 包并加入项目,然后用 XPath 表达式定位页面中的元素。

运行以下代码!

 

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * Demonstrates scraping web pages with HtmlCleaner and XPath.
 *
 * <p>{@link #main(String[])} downloads a shop front page, prints its title and
 * then lists the image sources and link targets found inside one container
 * div. {@link #test2()} lists the address and title of every link on a product
 * listing page. All results are written to standard output.
 */
public class HtmlCleanerDemo {

	/** XPath of the container div on the shop page whose images and links are listed. */
	private static final String SHOP_BOX_XPATH =
			"//div[@class='box J_TBox tshop-pbsm tshop-pbsm-ssd10c']";

	/**
	 * Downloads the product listing page and prints the {@code href} and
	 * {@code title} attributes of every link inside the product list div.
	 *
	 * @throws MalformedURLException if the hard-coded URL cannot be parsed
	 * @throws IOException if the page cannot be downloaded
	 * @throws XPatherException if the XPath expression cannot be evaluated
	 */
	public void test2() throws MalformedURLException, IOException, XPatherException {
		HtmlCleaner cleaner = new HtmlCleaner();
		TagNode page = cleaner.clean(new URL("http://shop.ali213.com/products_all_big.php"));

		Object[] matches = page.evaluateXPath("//div[@class='all_products_list']//*//a[@href]");
		for (Object match : matches) {
			// evaluateXPath returns Object[]; skip anything that is not an element node
			// instead of failing with a ClassCastException.
			if (!(match instanceof TagNode)) {
				continue;
			}
			TagNode link = (TagNode) match;
			System.out.println("超链接地址:" + link.getAttributeByName("href")
					+ "链接标题:" + link.getAttributeByName("title"));
		}
	}

	/**
	 * Entry point: prints the title, image paths and link addresses of the shop
	 * front page, then runs {@link #test2()}.
	 *
	 * @param args command-line arguments (unused)
	 * @throws IOException if a page cannot be downloaded
	 * @throws XPatherException if an XPath expression cannot be evaluated
	 */
	public static void main(String[] args) throws IOException, XPatherException {
		HtmlCleaner cleaner = new HtmlCleaner();

		// The page is decoded with the charset passed here ("gbk").
		TagNode page = cleaner.clean(new URL("http://osa.tmall.com/"), "gbk");

		// Lookup by tag name: the first <title> element found anywhere in the tree.
		Object[] titles = page.getElementsByName("title", true);
		if (titles.length > 0) {
			System.out.println("title是:" + ((TagNode) titles[0]).getText());
		}

		// Lookup by XPath: every <img> carrying a src attribute inside the container div.
		System.out.println("图片路径:img/src:");
		int imageCount = printAttributeOfMatches(
				page, SHOP_BOX_XPATH + "//*//img[@src]", "src", "图片路径:");
		System.out.println("共" + imageCount + "个");

		System.out.println("--------------------------------");

		// Same container, this time every <a> carrying an href attribute.
		System.out.println("获取超链接的地址:a");
		int linkCount = printAttributeOfMatches(
				page, SHOP_BOX_XPATH + "//*//a[@href]", "href", "超链接的链接地址:");
		System.out.println("超链接共:" + linkCount + "个");

		HtmlCleanerDemo demo = new HtmlCleanerDemo();
		demo.test2();
	}

	/**
	 * Prints one attribute of every element matched by an XPath expression, each
	 * on its own line followed by a blank line, and counts the printed elements.
	 *
	 * @param root node the XPath expression is evaluated against
	 * @param xpath XPath expression selecting the elements
	 * @param attribute name of the attribute to print
	 * @param label text printed in front of each attribute value
	 * @return number of matched elements that were printed
	 * @throws XPatherException if the XPath expression cannot be evaluated
	 */
	private static int printAttributeOfMatches(TagNode root, String xpath, String attribute,
			String label) throws XPatherException {
		int count = 0;
		for (Object match : root.evaluateXPath(xpath)) {
			// Only element nodes have attributes; ignore any other result type.
			if (!(match instanceof TagNode)) {
				continue;
			}
			TagNode element = (TagNode) match;
			System.out.println(label + element.getAttributeByName(attribute) + "\n");
			count++;
		}
		return count;
	}

}


运行结果(输出较长,以下内容不完整):


title是:
       O.SA 首页-osa品牌服饰旗舰店- 淘宝商城    
图片路径:img/src:
图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2nLedXlhXXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2sLadXmxXXXXXXXXX_%21%2194153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2yvadXmpXXXXXXXXX_%21%2194153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2819dXm8XXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2.19dXm4XXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2nfydXhRaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2GvydXhpaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2LLydXhdaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2Y2ydXg0aXXXXXXXX_%21%2194153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2HLudXiNaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2vvudXi8aXXXXXXXX_%21%2194153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2O2udXiFaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T21fudXitaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T242udXilaXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2hLydXh0aXXXXXXXX_%21%2194153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T232adXlNXXXXXXXXX_%21%2194153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2QA9eXbpaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2I9SdXapXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2zpV5XatXXXXXXXXX_%21%2194153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2jdmeXepaXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2nJmeXelaXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2r0meXedaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2oDecXftaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2ehGeXbRXXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2iNGeXbJXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2u4GeXbpXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2xxGeXbhXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2L4GeXa0XXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2PWOcXfpXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2PaOcXfpXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T221ieXoXaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T22uieXoXaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2YlmeXf8XXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2femeXnRaXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2v1meXnxaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2yumeXnpaXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2E1meXnhaXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2RKmeXm4aXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2VumeXmVaXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2_emeXmxaXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2buqeXmpaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T27KOeXlhaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2oKqeXl8aXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2weqeXl0aXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T22gWeXa8XXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2LlSdXe4aXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2gJeeXiVaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2xyieXXhXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T28cxZXapcXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2LdWeXeRaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2Tr1bXXlbXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2IH1bXXBbXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2coibXe4aXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2c.ibXe0aXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2soibXeBaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T26UibXdFaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T27EibXdJaXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2nUmbXdlaXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2efieXidXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2SF4tXdtNXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2z2ieXh4XXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2i4SeXhNXXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2dvieXhRXXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2O2ieXhJXXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taobaocdn.com/imgextra/i2/94153930/T2T2ieXhFXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T24vieXhtXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T28fieXhlXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2fvmeXhXXXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2f2meXg8XXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T203aeXcdaXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2f.KbXmVXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2I.GbXmVXXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2lUCbXmlXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2H.KbXl8XXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2OoKbXlVXXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2ZUKbXlxXXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T21tOeXcJbXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2m1eeXcJbXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T2EeeeXclbXXXXXXXX_!!94153930.jpg

图片路径:http://img03.taobaocdn.com/imgextra/i3/94153930/T2EueeXchbXXXXXXXX_!!94153930.jpg

图片路径:http://img04.taobaocdn.com/imgextra/i4/94153930/T2ZeeeXbNbXXXXXXXX_!!94153930.jpg

图片路径:http://img01.taobaocdn.com/imgextra/i1/94153930/T221eeXbRbXXXXXXXX_!!94153930.jpg

图片路径:http://img02.taoba
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值