使用htmlparser简单抓取京东图书信息存入数据库的小例子

在学习Lucene的时候需要大量的数据。这么多数据从哪里来?不可能手工一点一点地输入,于是我从网上找了个办法:抓取数据。

我把目标锁定在了京东商城的图书信息。下面是我抓取图书信息的一个简单例子,写得不是特别严谨,但对我学习来说足够了。如果每个页面都有效的话,完全能达到80万条数据。我把抓到的信息存入我设计的数据库中。在处理过程中,程序会把处理成功的网址信息以及出错信息写入e:/360book/book1.txt,因此使用前需要先在E盘创建一个名为360book的文件夹。

数据库表结构为

-- Target table for the crawler: one row per scraped book page.
-- All data columns are nullable strings; the Java code (Buy.save) binds every value with setString.
CREATE TABLE `books` (
   `id` bigint(20) NOT NULL AUTO_INCREMENT,
   `bookName` varchar(1024) DEFAULT NULL,
   `auther` varchar(1024) DEFAULT NULL, -- spelled "auther" on purpose: the INSERT statement in Buy.save uses this exact column name
   `booksName` varchar(1024) DEFAULT NULL,
   `publish` varchar(1024) DEFAULT NULL,
   `isbn` varchar(1024) DEFAULT NULL,
   `publishDate` varchar(1024) DEFAULT NULL,
   `edition` varchar(1024) DEFAULT NULL,
   `pages` varchar(1024) DEFAULT NULL,
   `frame` varchar(1024) DEFAULT NULL,
   `format` varchar(1024) DEFAULT NULL,
   `type` varchar(1024) DEFAULT NULL,
   `price` varchar(1024) DEFAULT NULL,
   PRIMARY KEY (`id`)
 -- NOTE(review): AUTO_INCREMENT=15779 looks like a leftover from exporting a table that already held rows;
 -- it can presumably be dropped when creating a fresh table - confirm.
 ) ENGINE=InnoDB AUTO_INCREMENT=15779 DEFAULT CHARSET=utf8


例子中使用的第三方库是htmlparser,版本为2.0。


这是处理页面信息的一个Class.

package com;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.HashMap;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

/**
 * Small crawler that walks the book detail pages
 * {@code http://book.360buy.com/<id>.html}, extracts the book name, the
 * summary fields and the price with htmlparser, and inserts one row per book
 * into the {@code books} table through {@link DBCon}.
 *
 * <p>One line per processed URL is appended to the log file: either
 * {@code <url>\t is ok!} or {@code <url>\t<exception>}. The directory of the
 * log file must already exist.
 */
public class Buy {

	/** Log file receiving one line per processed URL. */
	private static final String LOG_PATH = "e:/360book/book1.txt";

	/** Page ids are {@code ID_OFFSET + i} for i in [FIRST_INDEX, LAST_INDEX]. */
	private static final int ID_OFFSET = 10000000;
	private static final int FIRST_INDEX = 2;
	private static final int LAST_INDEX = 861852;

	/**
	 * Keys of the summary map, in the order in which the corresponding
	 * {@code <li>} items appear on the page. The same order is used for the
	 * INSERT parameters 2..11 in {@link #save}.
	 */
	private static final String[] SUMMARY_KEYS = {
		"auther", "booksName", "publish", "isbn", "publishDate",
		"edition", "pages", "frame", "format", "type"
	};

	/**
	 * Crawls every page id in the configured range. A failure on one page is
	 * printed, logged and does not stop the crawl.
	 *
	 * @param args unused
	 * @throws Exception declared for compatibility; per-page errors are handled inside the loop
	 */
	public static void main(String []args) throws Exception {
		File file = new File(LOG_PATH);
		for (int i = FIRST_INDEX; i <= LAST_INDEX; i++) {
			String url = "http://book.360buy.com/" + (ID_OFFSET + i) + ".html";
			String line;
			try {
				Buy.getBook(url);
				line = url + "\t is ok!";
			} catch (Exception e) {
				e.printStackTrace();
				// Include the URL so a failure in the log can be traced back to its page.
				line = url + "\t" + e;
			}
			try {
				appendLine(file, line);
			} catch (IOException e1) {
				// Logging is best effort: a broken log file must not abort the crawl.
				e1.printStackTrace();
			}
		}
	}

	/** Appends one line to the log file and always closes the writer. */
	private static void appendLine(File file, String line) throws IOException {
		PrintWriter out = new PrintWriter(new FileWriter(file, true));
		try {
			out.println(line);
		} finally {
			out.close();
		}
	}

	/**
	 * Downloads one book page, extracts its data and stores it in the database.
	 *
	 * @param url address of the book detail page
	 * @throws Exception if the page cannot be fetched or parsed, an expected
	 *         element is missing, no database connection is available, or the
	 *         insert fails
	 */
	public static void getBook(String url) throws Exception {

		String bookName;
		Map<String,String> map;
		String price;

		HttpURLConnection http = (HttpURLConnection) new URL(url).openConnection();
		try {
			Parser parser = new Parser(http);
			parser.setEncoding("gb2312");

			bookName = getBookName(parser.extractAllNodesThatMatch(new HasAttributeFilter("id","name")));

			// The parser is rewound before each further pass over the same page.
			parser.reset();
			map = getSummary(parser.extractAllNodesThatMatch(new HasAttributeFilter("id","summary")));

			parser.reset();
			price = getPrice(parser.extractAllNodesThatMatch(new HasAttributeFilter("id","book-price")));
		} finally {
			// Release the HTTP connection even when parsing fails.
			http.disconnect();
		}

		Connection con = DBCon.getConnection();
		if (con == null) {
			// DBCon.getConnection() reports failure by returning null.
			throw new IllegalStateException("could not open a database connection");
		}
		try {
			save(con, bookName, map, price);
		} finally {
			DBCon.closeConnection(con);
		}
	}

	/**
	 * Inserts one book row.
	 *
	 * @param con open connection; it is not closed by this method
	 * @param bookName value for the {@code bookName} column
	 * @param map summary values keyed as produced by {@link #getSummary}; missing keys are stored as NULL
	 * @param price value for the {@code price} column
	 * @throws Exception if preparing or executing the statement fails
	 */
	public static void save(Connection con ,String bookName,Map<String,String> map,String price)throws Exception {
		String sql = "insert into books (bookName,auther,booksName,publish,isbn,publishDate,edition,pages,frame,format,type,price) values(?,?,?,?,?,?,?,?,?,?,?,?)";
		PreparedStatement pstmt = con.prepareStatement(sql);
		try {
			pstmt.setString(1, bookName);
			// Parameters 2..11 follow the order of SUMMARY_KEYS, which matches the column list above.
			for (int k = 0; k < SUMMARY_KEYS.length; k++) {
				pstmt.setString(k + 2, map.get(SUMMARY_KEYS[k]));
			}
			pstmt.setString(SUMMARY_KEYS.length + 2, price);
			pstmt.executeUpdate();
		} finally {
			// Close the statement even when binding or executing throws.
			DBCon.closePreparedStatement(pstmt);
		}
	}

	/**
	 * Reads the summary fields from the direct {@code <li>} children of the
	 * first matched node. Fields are identified purely by position; items
	 * beyond the tenth are ignored.
	 *
	 * @param nodeList nodes matched for the summary element
	 * @return map from field key to extracted text
	 * @throws IllegalStateException if the summary element is missing or has no children
	 */
	public static Map<String,String> getSummary(NodeList nodeList){

		Map<String,String> map = new HashMap<String,String>();
		NodeList items = childrenOfFirst(nodeList, "summary").extractAllNodesThatMatch(new TagNameFilter("li"));
		int count = Math.min(items.size(), SUMMARY_KEYS.length);
		for (int i = 0; i < count; i++) {
			Node item = items.elementAt(i);
			String value;
			switch (i) {
			case 0:
				value = getAuther(item);
				break;
			case 2:
			case 9:
				value = getPublish(item);
				break;
			default:
				value = getValue(item);
				break;
			}
			map.put(SUMMARY_KEYS[i], value);
		}

		return map;
	}

	/**
	 * Returns the markup between the first {@code </span>} and the following
	 * {@code </li>} of the node's HTML.
	 *
	 * @param node a summary list item
	 * @return the text between the two markers
	 * @throws IllegalStateException if either marker is missing
	 */
	public static String getValue(Node node){

		return between(node.toHtml(), "</span>", "</li>");
	}

	/**
	 * Returns the texts of the direct {@code <a>} children of the node, joined with commas.
	 *
	 * @param node a summary list item
	 * @return comma-separated link texts, or an empty string if there are none
	 */
	public static String getPublish(Node node){

		return joinLinkTexts(node);
	}

	/**
	 * Returns the texts of the direct {@code <a>} children of the node, joined with commas.
	 *
	 * @param node a summary list item
	 * @return comma-separated link texts, or an empty string if there are none
	 */
	public static String getAuther(Node node){

		return joinLinkTexts(node);
	}

	/**
	 * Returns the markup between {@code <h1>} and the following {@code <span>}
	 * in the children of the first matched node.
	 *
	 * @param nodeList nodes matched for the name element
	 * @return the extracted book name
	 * @throws IllegalStateException if the element or one of the markers is missing
	 */
	public static String getBookName(NodeList nodeList){

		return between(childrenOfFirst(nodeList, "name").toHtml(), "<h1>", "<span>");
	}

	/**
	 * Returns the content of the first {@code <del>} element found among those
	 * children of the first matched node that themselves have a {@code <del>} child.
	 *
	 * @param nodeList nodes matched for the price element
	 * @return the text between {@code <del>} and {@code </del>}
	 * @throws IllegalStateException if the element or the {@code <del>} markers are missing
	 */
	public static String getPrice(NodeList nodeList){

		NodeFilter hasDelChild = new HasChildFilter(new TagNameFilter("del"));
		NodeList holders = childrenOfFirst(nodeList, "book-price").extractAllNodesThatMatch(hasDelChild);
		return between(holders.toHtml(), "<del>", "</del>");
	}

	/** Returns the children of the first node in the list, failing with a clear message if absent. */
	private static NodeList childrenOfFirst(NodeList nodeList, String id) {
		if (nodeList == null || nodeList.size() == 0) {
			throw new IllegalStateException("element with id \"" + id + "\" not found on page");
		}
		NodeList children = nodeList.elementAt(0).getChildren();
		if (children == null) {
			throw new IllegalStateException("element with id \"" + id + "\" has no child nodes");
		}
		return children;
	}

	/** Returns the text between the first {@code open} marker and the next {@code close} marker after it. */
	private static String between(String html, String open, String close) {
		int start = html.indexOf(open);
		if (start < 0) {
			throw new IllegalStateException("marker " + open + " not found");
		}
		start += open.length();
		int end = html.indexOf(close, start);
		if (end < 0) {
			throw new IllegalStateException("marker " + close + " not found after " + open);
		}
		return html.substring(start, end);
	}

	/** Joins the text of the first child of every direct {@code <a>} child of the node with commas. */
	private static String joinLinkTexts(Node node) {
		NodeList children = node.getChildren();
		if (children == null) {
			return "";
		}
		NodeList links = children.extractAllNodesThatMatch(new TagNameFilter("a"));
		StringBuilder joined = new StringBuilder();
		boolean first = true;
		for (int i = 0; i < links.size(); i++) {
			NodeList linkChildren = links.elementAt(i).getChildren();
			if (linkChildren == null || linkChildren.size() == 0) {
				// An empty <a></a> carries no text; skip it instead of failing.
				continue;
			}
			if (!first) {
				joined.append(',');
			}
			joined.append(linkChildren.elementAt(0).getText());
			first = false;
		}
		return joined.toString();
	}
}

DBCon.java
package com;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

//import javax.naming.Context;
//import javax.naming.InitialContext;
//import javax.sql.DataSource;

/**
 * JDBC helper: opens a MySQL connection through {@link DriverManager} and
 * closes JDBC resources without propagating {@link SQLException}.
 *
 * <p>An earlier JNDI {@code DataSource} lookup was disabled in favour of the
 * direct {@code DriverManager} call used here.
 */
public class DBCon {

	/**
	 * Loads the MySQL driver and opens a connection to the local {@code test} database.
	 *
	 * @return the open connection, or {@code null} if loading the driver or
	 *         connecting failed (the stack trace is printed in that case)
	 */
	public static Connection getConnection(){
		try {
			Class.forName("com.mysql.jdbc.Driver").newInstance();
			return DriverManager.getConnection("jdbc:mysql://localhost:3306/test?user=root&password=root");
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Closes the connection; a {@code null} argument is ignored and a
	 * {@link SQLException} is printed rather than thrown.
	 *
	 * @param con connection to close, may be {@code null}
	 */
	public static void closeConnection(Connection con) {
		if (con == null) {
			return;
		}
		try {
			con.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Closes the result set; a {@code null} argument is ignored and a
	 * {@link SQLException} is printed rather than thrown.
	 *
	 * @param rs result set to close, may be {@code null}
	 */
	public static void closeResultSet(ResultSet rs) {
		if (rs == null) {
			return;
		}
		try {
			rs.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Closes the prepared statement; a {@code null} argument is ignored and a
	 * {@link SQLException} is printed rather than thrown.
	 *
	 * @param pstmt statement to close, may be {@code null}
	 */
	public static void closePreparedStatement(PreparedStatement pstmt){
		if (pstmt == null) {
			return;
		}
		try {
			pstmt.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Closes the statement; a {@code null} argument is ignored and a
	 * {@link SQLException} is printed rather than thrown.
	 *
	 * @param stmt statement to close, may be {@code null}
	 */
	public static void closeStatement(Statement stmt){
		if (stmt == null) {
			return;
		}
		try {
			stmt.close();
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}
}




  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值