java 网络爬虫爬取图书馆的指定字段信息详解

首先创建一个maven工程

项目结构示意图:
在这里插入图片描述

1. 在 pom.xml 的 <dependencies> 中加入以下依赖:

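<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
</dependency>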

存储数据的实体类 Book:
package model;

/**
 * Mutable value holder for one book record.
 *
 * <p>Every attribute is kept as a plain {@code String}; no validation or
 * conversion is performed by this class. All fields are {@code null} until
 * the corresponding setter is called.
 */
public class Book {

    /** Identifier of the book. */
    private String bookID;

    /** Title of the book. */
    private String bookName;

    /** Price of the book, stored as text. */
    private String bookPrice;

    /** Author of the book. */
    private String bookAuthor;

    /** Publisher of the book (the spelling "Pulish" is part of the existing API). */
    private String bookPulish;

    /** Publication year of the book, stored as text. */
    private String bookYear;

    /** @return the book identifier, or {@code null} if it was never set */
    public String getBookID() {
        return this.bookID;
    }

    /** @param bookID the book identifier to store */
    public void setBookID(String bookID) {
        this.bookID = bookID;
    }

    /** @return the book title, or {@code null} if it was never set */
    public String getBookName() {
        return this.bookName;
    }

    /** @param bookName the book title to store */
    public void setBookName(String bookName) {
        this.bookName = bookName;
    }

    /** @return the book price text, or {@code null} if it was never set */
    public String getBookPrice() {
        return this.bookPrice;
    }

    /** @param bookPrice the book price text to store */
    public void setBookPrice(String bookPrice) {
        this.bookPrice = bookPrice;
    }

    /** @return the book author, or {@code null} if it was never set */
    public String getBookAuthor() {
        return this.bookAuthor;
    }

    /** @param bookAuthor the book author to store */
    public void setBookAuthor(String bookAuthor) {
        this.bookAuthor = bookAuthor;
    }

    /** @return the book publisher, or {@code null} if it was never set */
    public String getBookPulish() {
        return this.bookPulish;
    }

    /** @param bookPulish the book publisher to store */
    public void setBookPulish(String bookPulish) {
        this.bookPulish = bookPulish;
    }

    /** @return the publication year text, or {@code null} if it was never set */
    public String getBookYear() {
        return this.bookYear;
    }

    /** @param bookYear the publication year text to store */
    public void setBookYear(String bookYear) {
        this.bookYear = bookYear;
    }
}

httpUtil:
package util;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHttpResponse;
public class httpUtil{
public static HttpResponse getHtml(HttpClient httpclient, String url) throws IOException
{
HttpGet getMethod = new HttpGet(url); //get方法
HttpResponse response = new BasicHttpResponse(HttpVersion.HTTP_1_1,HttpStatus.SC_OK,“ok”); //response初始化
response = httpclient.execute(getMethod); //执行get方法
return response;
}

}

URLEntity:
package util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.util.EntityUtils;

import parse.bookPrase;//?
import model.Book;

public class URLEntity {
public static List URLParse(HttpClient httpclient,String url) throws IOException
{
List getbooks = new ArrayList();
HttpResponse response = httpUtil.getHtml(httpclient, url);
int statusCode = response.getStatusLine().getStatusCode(); //获取状态码
if(statusCode == 200) //200为正常
{
String entity = EntityUtils.toString(response.getEntity(),“utf-8”);
getbooks = bookPrase.getData(entity);
EntityUtils.consume(response.getEntity()); //消耗实体类,实体类最后需要消耗
}
else
EntityUtils.consume(response.getEntity());

    return getbooks;
}

}

连接数据库:
mysql_source:
package db;

import javax.sql.DataSource;

import org.apache.tomcat.dbcp.dbcp2.BasicDataSource;

public class mysql_source {
public static DataSource getDataSource(String connectURI)
{
BasicDataSource ds = new BasicDataSource();
ds.setDriverClassName(“com.mysql.jdbc.Driver”);
ds.setUsername(“root”);
//ds.setPassword("");
ds.setUrl(connectURI);
return ds;
}

}

参数的传递和sql语句的插入mysql_control:
package db;

import java.sql.SQLException;
import java.util.List;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;

import model.Book;

public class mysql_control {
static DataSource ds = mysql_source.getDataSource(“jdbc:mysql://127.0.0.1:3306/book”);
static QueryRunner qr = new QueryRunner(ds);

public static void executeInsert(List<Book> bookdatas) throws SQLException
{
    Object[][] params = new Object[bookdatas.size()][5];
    for(int i=0; i<params.length; i++)
    {
        params[i][0] = bookdatas.get(i).getBookID();
        params[i][1] = bookdatas.get(i).getBookName();
        params[i][2] = bookdatas.get(i).getBookAuthor(); 
        params[i][3] = bookdatas.get(i).getBookPulish(); 
        params[i][4] = bookdatas.get(i).getBookYear(); 
    }
    qr.batch("insert into bok_book(book_no,book_name,book_author,book_house,book_memo)values(?,?,?,?,?)", params);
    System.out.println("成功插入" + bookdatas.size() + "条");
}

}

测试类 bookMain:
package mian;

import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;

import util.URLEntity;
import db.mysql_control;
import model.Book;

/**
 * Entry point: crawls the seed URL, logs every book found and stores the
 * results in the database.
 */
public class bookMain {

    /** Commons Logging logger for this class. */
    static final Log logger = LogFactory.getLog(bookMain.class);

    /**
     * Runs one crawl of the seed URL.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if fetching, parsing or the database insert fails
     */
    public static void main(String[] args) throws Exception {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            String url = "***"; // seed URL placeholder
            List<Book> books = URLEntity.URLParse(httpclient, url);
            for (Book book : books) {
                logger.info("bookId:" + book.getBookID() + "\t" + "bookName:" + book.getBookName() + "\t" + "bookPrice:"
                        + book.getBookPrice() + "\t"+"bookAuthor:" + book.getBookAuthor() + "\t" +"bookPulish:" + book.getBookPulish() + "\t");
            }
            mysql_control.executeInsert(books); // persist the crawled books
        } finally {
            // Shut the connection manager down so the client's connections are
            // released even when the crawl or the insert fails.
            httpclient.getConnectionManager().shutdown();
        }
    }

}

执行结果:
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值