一、概念
HttpClient 是 Apache Jakarta Commons 下的子项目,可以用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。
HttpClient实现了所有HTTP的方法(GET/POST/PUT/HEAD等),支持自动转向、HTTPS以及代理服务器功能。
二、HttpClient的使用
参考官网示例http://hc.apache.org/httpcomponents-client-4.5.x/quickstart.html
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient demo: issues a GET request, prints the status code and,
 * on HTTP 200, prints the response body.
 */
public class HttpTest {
    /**
     * Runs the demo request against a fixed URL.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.zhihu.com/";
        HttpGet httpGet = new HttpGet(url);
        HttpPost httpPost = new HttpPost(url);

        // try-with-resources closes the response first and then the client,
        // even when execute() or the body handling throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            System.out.println(httpGet);
            // The POST request is only constructed and printed, never executed.
            System.out.println(httpPost);
            try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                System.out.println(httpResponse);
                // Read the response status code
                int status = httpResponse.getStatusLine().getStatusCode();
                System.out.println(status);
                if (status == 200) {
                    String entity = EntityUtils.toString(httpResponse.getEntity());
                    System.out.println(entity);
                }
                // Discard any remaining content in both branches, as the original did.
                EntityUtils.consume(httpResponse.getEntity());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
可以总结,使用HttpClient的步骤如下:
(1)创建HttpClient实例;
(2)通过HttpGet/HttpPost方法创建请求方法实例;
(3)调用HttpClient的execute()方法请求访问具体的资源,返回HttpResponse实例;
(4)对响应进行解析:读取响应状态码、响应内容(如网页)等,并据此实现具体的操作;
(5)释放连接
三、具体实例
下面实现爬取京东书城上第一页关于Python书籍的ID、书名和价格,并将其存入到本地数据库的爬虫代码:
1、创建书籍实体类
/**
* 京东图书实体类
*/
/**
 * Plain data holder for one book: its ID, name and price, all kept as strings.
 */
public class Book {
    /** Book ID. */
    private String bookID;
    /** Book name. */
    private String bookName;
    /** Book price. */
    private String bookPrice;

    // ---- accessors ----

    /** @return the book ID, or {@code null} if it was never set */
    public String getBookID() {
        return this.bookID;
    }

    /** @return the book name, or {@code null} if it was never set */
    public String getBookName() {
        return this.bookName;
    }

    /** @return the book price, or {@code null} if it was never set */
    public String getBookPrice() {
        return this.bookPrice;
    }

    // ---- mutators ----

    /** @param bookID the book ID to store */
    public void setBookID(String bookID) {
        this.bookID = bookID;
    }

    /** @param bookName the book name to store */
    public void setBookName(String bookName) {
        this.bookName = bookName;
    }

    /** @param bookPrice the book price to store */
    public void setBookPrice(String bookPrice) {
        this.bookPrice = bookPrice;
    }

    /**
     * Renders all three fields in the form
     * {@code Book{bookID='..', bookName='..', bookPrice='..'}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Book{");
        text.append("bookID='").append(bookID).append('\'');
        text.append(", bookName='").append(bookName).append('\'');
        text.append(", bookPrice='").append(bookPrice).append('\'');
        text.append('}');
        return text.toString();
    }
}
2、爬虫类
import JD.model.Book;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler that fetches one JD search-result page for the keyword "Python",
 * extracts each book's ID, name and price with Jsoup, and inserts the
 * results into the {@code book} table of a MySQL database.
 */
public class BookCrawler {
    static final Logger logger = LoggerFactory.getLogger(BookCrawler.class);

    /**
     * Downloads the search page, parses the book list and stores it.
     *
     * @param args unused
     * @throws IOException if the HTTP request fails or the body cannot be read
     * @throws SQLException if connecting to the database or an insert fails
     * @throws ClassNotFoundException if the MySQL JDBC driver class cannot be loaded
     */
    public static void main(String[] args) throws IOException, SQLException, ClassNotFoundException {
        // URL of the page to crawl
        String url = "http://search.jd.com/Search?keyword=Python&enc=utf-8&qrst=1&rt=1&stop=1&book=y&pt=1&vt=2&cid2=3287&stock=1&click=3";
        // Request the page content with GET
        HttpGet httpGet = new HttpGet(url);
        // Books extracted from the page; stays empty on a non-200 response
        List<Book> books = new ArrayList<>();

        // try-with-resources releases the response and the client even if
        // reading or parsing the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                String entity = EntityUtils.toString(httpResponse.getEntity(), "utf-8");
                books = parseBooks(entity);
            } else {
                logger.warn("Unexpected HTTP status {} for {}", statusCode, url);
                EntityUtils.consume(httpResponse.getEntity());
            }
        }

        // The HTTP resources are already released before the database work starts.
        if (exeInsertData(books)) {
            logger.info("Insert success!");
        } else {
            logger.info("Insert Fail!");
        }
    }

    /**
     * Parses the fetched HTML with Jsoup and builds one {@link Book} per
     * {@code li.gl-item} element inside the {@code ul.gl-warp.clearfix} list.
     *
     * @param html the page source
     * @return the extracted books, in document order; empty if nothing matches
     */
    private static List<Book> parseBooks(String html) {
        List<Book> books = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("ul[class=gl-warp clearfix]").select("li[class=gl-item]");
        for (Element ele : elements) {
            Book book = new Book();
            book.setBookID(ele.attr("data-sku"));
            book.setBookName(ele.select("div[class=p-name]").select("em").text());
            book.setBookPrice(ele.select("div[class=p-price]").select("strong").select("i").text());
            books.add(book);
        }
        return books;
    }

    /**
     * Connects to the database and inserts the crawled books one row at a time.
     * Stops at the first insert that affects no row.
     *
     * @param books the books to insert
     * @return {@code true} if every insert affected at least one row,
     *         {@code false} as soon as one insert affects none
     * @throws ClassNotFoundException if the MySQL JDBC driver class cannot be loaded
     * @throws SQLException if the connection or a statement fails
     */
    public static Boolean exeInsertData(List<Book> books) throws ClassNotFoundException, SQLException {
        Class.forName("com.mysql.cj.jdbc.Driver");
        String sql = "insert into book (bookID, bookName, bookPrice) values (?, ?, ?)";
        // NOTE(review): connection URL and credentials are hard-coded; consider externalizing them.
        // try-with-resources closes the statement and connection on every exit
        // path, including the early return and any SQLException.
        try (Connection conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1/learn?useUnicode=true&characterEncoding=utf-8&&useSSL=false&serverTimezone=UTC","root", "admin");
             PreparedStatement exeUpdate = conn.prepareStatement(sql)) {
            for (Book book : books) {
                exeUpdate.setString(1, book.getBookID());
                exeUpdate.setString(2, book.getBookName());
                exeUpdate.setString(3, book.getBookPrice());
                if (exeUpdate.executeUpdate() == 0) {
                    logger.info("insert fail");
                    return false;
                }
            }
        }
        return true;
    }
}
结果数据库中插入如下数据:
1、http://hc.apache.org/httpcomponents-client-4.5.x/quickstart.html
2、http://blog.csdn.net/qy20115549/article/details/52203722#再看main方法
3、http://git.oschina.net/liaoshixiong/Spiderman2/tree/master/src/main/java/net/kernal/spiderman