I won't introduce web crawlers at length here; this post is simply a record of my own first steps with crawling.
A simple crawler template using only the JDK
import org.junit.Test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
public class JDKSpider {
//GET request
@Test
public void JDKGet() throws Exception {
//target URL
String indexUrl = "https://www.baidu.com/";
URL url = new URL(indexUrl);
//open the connection
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
//set the request method (GET is the default)
conn.setRequestMethod("GET");
//read the response
InputStream in = conn.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
String line;
while ((line = br.readLine()) != null) {
System.out.println(line);
}
//close resources
br.close();
in.close();
}
//POST request
@Test
public void JDKPost() throws Exception{
String indexUrl = "https://www.baidu.com/";
URL url = new URL(indexUrl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
//use POST as the request method
conn.setRequestMethod("POST");
//enable output so a request body can be written
conn.setDoOutput(true);
OutputStream out = conn.getOutputStream();
//write a sample request body (placeholder form data)
out.write("wd=spider".getBytes());
//read the response
InputStream in = conn.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
String line;
while ((line = br.readLine())!=null){
System.out.println(line);
}
out.close();
br.close();
in.close();
}
}
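The template above sends no request headers, and some sites reject clients without a User-Agent. Below is a small sketch (not part of the original template) of adding headers and timeouts with HttpURLConnection.setRequestProperty(); the header values are only placeholders.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
public class JDKSpiderWithHeaders {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://www.baidu.com/");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        // pretend to be an ordinary browser (illustrative values)
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
        conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        // fail fast instead of hanging on a slow server
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}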
A Jsoup crawler template
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
public class JsoupDemo {
@Test
public void method() throws Exception{
// static Connection connect(String url): create and return a connection to the given url
String url = "https://www.baidu.com";
Connection conn = Jsoup.connect(url);
Document dom1 = conn.get();
System.out.println(dom1);
// static Document parse(File in, String charsetName): parse a file with the given charset into a Document
// static Document parse(String html): parse the given html string into a Document
}
@Test
public void demo02() throws Exception {
// Goal: print some of the category links from the Taobao homepage
// 1. the url to request
String indexUrl = "https://www.taobao.com//";
// 2. send the request and parse the response into a Document
Document dom = Jsoup.connect(indexUrl).get();
// 3. select the target elements with a CSS selector
Elements as = dom.select(".service-bd>li>a");
// System.out.println(as);
// 4. iterate over the selected elements and print their text
for (Element a : as) {
System.out.println(a.text());
}
}
}
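The comments in method() above mention the two static parse overloads without showing them. A minimal sketch of both; the html string and the file path are made up for illustration.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
public class JsoupParseDemo {
    public static void main(String[] args) throws Exception {
        // parse(String html): build a Document directly from an html string
        Document doc = Jsoup.parse("<html><body><a href='https://www.baidu.com'>baidu</a></body></html>");
        System.out.println(doc.select("a").attr("href"));
        // parse(File in, String charsetName): parse a local html file (hypothetical path)
        Document fileDoc = Jsoup.parse(new File("d:/test/index.html"), "UTF-8");
        System.out.println(fileDoc.title());
    }
}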
Crawler example 1: crawling list pages in a loop
Project structure
1. Dependencies in the pom file (MySQL 8.0 is used; dependency versions would normally have to be specified, but this project has a parent pom that manages them, so the versions are omitted here)
<dependencies>
<!--junit-->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<!--database access-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
</dependency>
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
</dependency>
<!--crawling-->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
</dependencies>
2. POJO class (the field names match the column names of the database table)
package com.zyx.jd_spider.pojo;
public class Item {
private Integer id;
private Long spu;
private Long sku;
private String title;
private Double price;
private String pic;
private String url;
private String created;
private String updated;
public Item() {
}
public Item(Integer id, Long spu, Long sku, String title, Double price, String pic, String url, String created, String updated) {
this.id = id;
this.spu = spu;
this.sku = sku;
this.title = title;
this.price = price;
this.pic = pic;
this.url = url;
this.created = created;
this.updated = updated;
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public Long getSpu() {
return spu;
}
public void setSpu(Long spu) {
this.spu = spu;
}
public Long getSku() {
return sku;
}
public void setSku(Long sku) {
this.sku = sku;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public Double getPrice() {
return price;
}
public void setPrice(Double price) {
this.price = price;
}
public String getPic() {
return pic;
}
public void setPic(String pic) {
this.pic = pic;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getCreated() {
return created;
}
public void setCreated(String created) {
this.created = created;
}
public String getUpdated() {
return updated;
}
public void setUpdated(String updated) {
this.updated = updated;
}
@Override
public String toString() {
return "Item{" +
"id=" + id +
", spu=" + spu +
", sku=" + sku +
", title='" + title + '\'' +
", price=" + price +
", pic='" + pic + '\'' +
", url='" + url + '\'' +
", created='" + created + '\'' +
", updated='" + updated + '\'' +
'}';
}
}
3. DAO
package com.zyx.jd_spider.dao;
import com.zyx.jd_spider.pojo.Item;
import com.zyx.utils.DruidUtils;
import org.apache.commons.dbutils.QueryRunner;
public class ItemDao {
/*
`id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`spu` bigint(15) DEFAULT NULL COMMENT 'product group (SPU) id',
`sku` bigint(15) DEFAULT NULL COMMENT 'stock keeping unit (SKU) id',
`title` varchar(1000) DEFAULT NULL COMMENT 'product title',
`price` double(10,0) DEFAULT NULL COMMENT 'product price',
`pic` varchar(200) DEFAULT NULL COMMENT 'product image path',
`url` varchar(1500) DEFAULT NULL COMMENT 'product detail url',
`created` varchar(100) DEFAULT NULL COMMENT 'creation time',
`updated` varchar(100) DEFAULT NULL COMMENT 'update time',
*/
//insert one item into the database
public void add(Item item) throws Exception{
QueryRunner qr = new QueryRunner(DruidUtils.getDataSource());
String sql = "insert into jd_item values (null,?,?,?,?,?,?,?,?)";
qr.update(sql,item.getSpu(),item.getSku(),item.getTitle(),item.getPrice(),item.getPic(),item.getUrl(),item.getCreated(),item.getUpdated());
}
}
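The crawler below calls add() once per item. As a possible refinement (not part of the original project), commons-dbutils can also batch the inserts, which saves round trips to the database; a sketch of such a method for the same table (it would additionally need an import of java.util.List):
// hypothetical addition to ItemDao: insert a whole page of items in one batch
public void addBatch(List<Item> items) throws Exception {
    QueryRunner qr = new QueryRunner(DruidUtils.getDataSource());
    String sql = "insert into jd_item values (null,?,?,?,?,?,?,?,?)";
    Object[][] params = new Object[items.size()][];
    for (int i = 0; i < items.size(); i++) {
        Item item = items.get(i);
        params[i] = new Object[]{item.getSpu(), item.getSku(), item.getTitle(), item.getPrice(),
                item.getPic(), item.getUrl(), item.getCreated(), item.getUpdated()};
    }
    // QueryRunner.batch runs the statement once per parameter row
    qr.batch(sql, params);
}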
4. Druid configuration (druid.properties)
# the MySQL 5 driver class (com.mysql.jdbc.Driver) differs from the MySQL 8 one; the MySQL 8 driver is used here
driverClassName=com.mysql.cj.jdbc.Driver
url=jdbc:mysql://localhost:3306/spiderData?useUnicode=true&serverTimezone=GMT&useSSL=false&allowPublicKeyRetrieval=true
username=root
password=******
# initial number of connections
initialSize=5
# maximum number of active connections
maxActive=10
# maximum time to wait for a connection (ms)
maxWait=3000
5. A custom Druid utility class
package com.zyx.utils;
import com.alibaba.druid.pool.DruidDataSourceFactory;
import javax.sql.DataSource;
import java.io.InputStream;
import java.sql.Connection;
import java.util.Properties;
public class DruidUtils {
static DataSource dataSource = null;
static {
try {
Properties prop = new Properties();
InputStream in = DruidUtils.class.getClassLoader().getResourceAsStream("druid.properties");
prop.load(in);
dataSource = DruidDataSourceFactory.createDataSource(prop);
} catch (Exception e) {
e.printStackTrace();
}
}
public static Connection getConnection() throws Exception{
return dataSource.getConnection();
}
public static DataSource getDataSource(){
return dataSource;
}
}
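Before crawling, it is worth verifying that the pool is configured correctly by borrowing a connection and returning it. A minimal smoke test (the test class name is made up):
import org.junit.Test;
import java.sql.Connection;
public class DruidUtilsTest {
    @Test
    public void connectionSmokeTest() throws Exception {
        // borrow a connection from the pool; this fails fast if druid.properties is wrong
        Connection conn = DruidUtils.getConnection();
        System.out.println("connected: " + !conn.isClosed());
        // return the connection to the pool
        conn.close();
    }
}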
6. Crawling the data
package com.zyx.jd_spider.main;
import com.zyx.jd_spider.dao.ItemDao;
import com.zyx.jd_spider.pojo.Item;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.UUID;
public class JDSpider {
@Test
public void test() throws Exception{
//create the http client (acts as the browser)
CloseableHttpClient httpClient = HttpClients.createDefault();
int page = 1;
//crawl the first 9 search-result pages
while (page < 10){
System.out.println(page);
//pause for a second between pages to avoid hammering the server
Thread.sleep(1000);
//search-result url for this page (the page parameter in the url is 2 * page - 1)
String IndexUrl = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.his.0.0&wq=%E6%89%8B%E6%9C%BA&page=" + (2 * page -1) + "&click=0";
//send the request and get the response
CloseableHttpResponse response = getCloseableHttpResponse(httpClient, IndexUrl);
//extract the html from the response
String html = getHtml(response);
if (html !=null && !html.equals("")){
//parse the html into Item objects
List<Item> list = getHtml2List(httpClient, html);
ItemDao itemDao = new ItemDao();
for (Item item : list) {
itemDao.add(item);
}
}
page++;
}
//close the client
httpClient.close();
}
public static String getHtml(CloseableHttpResponse response) throws IOException {
int statusCode = response.getStatusLine().getStatusCode();
String html = null;
//only read the body of successful responses
if (statusCode == 200){
//read the response entity as a string
HttpEntity entity = response.getEntity();
html = EntityUtils.toString(entity);
}
//release the connection whether or not the request succeeded
response.close();
return html;
}
public static CloseableHttpResponse getCloseableHttpResponse(CloseableHttpClient httpClient, String indexUrl) throws IOException {
//build the GET request
HttpGet httpGet = new HttpGet(indexUrl);
//set request headers to mimic a normal browser
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
httpGet.setHeader("Accept-Encoding", "gzip, deflate");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
httpGet.setHeader("Connection", "keep-alive");
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36");
//execute the request and return the response
return httpClient.execute(httpGet);
}
public static List<Item> getHtml2List(CloseableHttpClient httpClient, String html) throws Exception {
List<Item> list = new ArrayList<>();
//parse the html into a DOM
Document dom = Jsoup.parse(html);
Elements elements = dom.select(".gl-warp.clearfix>.gl-item");
for (Element element : elements) {
String url ="http:" + element.select(".gl-i-wrap>.p-img>a>img").attr("data-lazy-img");
String pic = "d:/test/jd/"+ UUID.randomUUID()+"."+url.substring(url.length()-3);
//request the product image so it can be downloaded
CloseableHttpResponse picResponse = getCloseableHttpResponse(httpClient, url);
//get the image response entity
HttpEntity entity = picResponse.getEntity();
InputStream in = entity.getContent();
//save the image to local disk
BufferedInputStream bis = new BufferedInputStream(in);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(pic));
int len;
byte[] bytes = new byte[1024 * 8];
while ((len = bis.read(bytes))!=-1){
bos.write(bytes,0,len);
}
//close the streams
bis.close();
bos.close();
picResponse.close();
//spu (empty for some items; default to 0)
String spu = element.attr("data-spu");
if (spu.equals("")){
spu = "0";
}
//sku
String sku = element.attr("data-sku");
//product title
String title = element.select(".gl-i-wrap>.p-name.p-name-type-2 em").text();
//price
String price = element.select(".gl-i-wrap>.p-price i").text();
String created = new Date().toLocaleString();
String updated = new Date().toLocaleString();
//wrap everything into an Item and let the DAO write it to MySQL (the id column is auto-generated, so the first argument is ignored)
Item item = new Item(1, Long.valueOf(spu), Long.valueOf(sku), title, Double.parseDouble(price), pic, url, created, updated);
list.add(item);
}
return list;
}
}
Crawler example 2: crawling with a thread pool (an improved version of example 1)
7. Crawl the data with a thread pool (plus an extra thread that monitors the pool)
package com.zyx.jd_spider.main;
import com.zyx.jd_spider.dao.ItemDao;
import com.zyx.jd_spider.pojo.Item;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.*;
import java.util.*;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
public class JDSpider2 {
public static void main(String[] args) throws IOException {
//create the http client (acts as the browser)
CloseableHttpClient httpClient = HttpClients.createDefault();
//thread-safe queue of page urls shared by all worker tasks
LinkedBlockingDeque<String> list = new LinkedBlockingDeque<>();
for (int i = 1; i <= 50; i++) {
String IndexUrl = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.his.0.0&wq=%E6%89%8B%E6%9C%BA&page=" + (2 * i -1) + "&click=0";
list.add(IndexUrl);
}
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(10);
/* executor.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
int size = list.size();
if (size <= 0) {
executor.shutdown();
}
}
}, 3, 2, TimeUnit.SECONDS);*/
//schedule 5 recurring crawl tasks; each run starts 5 seconds after the previous one finished
for (int i = 0; i < 5; i++) {
executor.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
try {
//take the next url; poll() returns null when the queue is empty,
//which avoids the race where another worker removes the last url first
String indexUrl = list.poll();
if (indexUrl != null){
//send the request and get the response
CloseableHttpResponse response = getCloseableHttpResponse(httpClient, indexUrl);
//extract the html from the response
String html = getHtml(response);
if (html !=null && !html.equals("")){
ItemDao itemDao = new ItemDao();
//parse the html into Item objects
List<Item> itemList = getHtml2List(httpClient, html);
for (Item item : itemList) {
itemDao.add(item);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
},0,5,TimeUnit.SECONDS);
}
//monitor thread: when the url queue is empty, shut down the pool; once every task has finished, close the httpClient connection
new Thread(new Runnable() {
boolean flag = true;
@Override
public void run() {
while (flag){
try {
//once all urls have been handed out, shut the pool down
if (list.size()<=0) {
executor.shutdown();
}
//sleep before checking again
Thread.sleep(20000);
// System.out.println("我现在"+(executor.isTerminated()?"可以":"还不可以")+"关闭浏览器啊,还差"+list.size());
//once every task in the pool has finished, close the client and end this monitor thread
if (executor.isTerminated()){
flag = false;
httpClient.close();
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
}).start();
}
public static CloseableHttpResponse getCloseableHttpResponse(CloseableHttpClient httpClient, String indexUrl) throws IOException {
//build the GET request
HttpGet httpGet = new HttpGet(indexUrl);
//set request headers to mimic a normal browser
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
httpGet.setHeader("Accept-Encoding", "gzip, deflate");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
httpGet.setHeader("Connection", "keep-alive");
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36");
//execute the request and return the response
return httpClient.execute(httpGet);
}
public static String getHtml(CloseableHttpResponse response) throws IOException {
int statusCode = response.getStatusLine().getStatusCode();
String html = null;
//only read the body of successful responses
if (statusCode == 200){
//read the response entity as a string
HttpEntity entity = response.getEntity();
html = EntityUtils.toString(entity);
}
//release the connection whether or not the request succeeded
response.close();
return html;
}
//parse the html into the Item list that will be written to MySQL
public static List<Item> getHtml2List(CloseableHttpClient httpClient, String html) throws Exception {
Document dom = Jsoup.parse(html);
Elements elements = dom.select(".gl-warp.clearfix>.gl-item");
List<Item> list = new ArrayList<>();
for (Element element : elements) {
String url ="http:" + element.select(".gl-i-wrap>.p-img>a>img").attr("data-lazy-img");
String pic ="";
//request the product image so it can be downloaded
CloseableHttpResponse picResponse = getCloseableHttpResponse(httpClient, url);
//only save the image when the image request succeeded
if (picResponse.getStatusLine().getStatusCode() == 200){
pic = "d:/test/jd/"+ UUID.randomUUID()+"."+url.substring(url.length()-3);
//get the image response entity
HttpEntity entity = picResponse.getEntity();
//save the image to local disk
InputStream in = entity.getContent();
BufferedInputStream bis = new BufferedInputStream(in);
OutputStream out = new FileOutputStream(pic);
BufferedOutputStream bos = new BufferedOutputStream(out);
int len;
byte[] bytes = new byte[1024 * 16];
//copy the image through the buffered streams
while ((len = bis.read(bytes)) != -1){
bos.write(bytes, 0, len);
}
//close the streams (closing the buffered wrappers also closes the underlying streams)
bos.close();
bis.close();
}
//release the image response whether or not the download succeeded
picResponse.close();
//spu (empty for some items; default to 0)
String spu = element.attr("data-spu");
if (spu.equals("")){
spu = "0";
}
//sku
String sku = element.attr("data-sku");
//product title
String title = element.select(".gl-i-wrap>.p-name.p-name-type-2 em").text();
//price
String price = element.select(".gl-i-wrap>.p-price i").text();
String created = new Date().toLocaleString();
String updated = new Date().toLocaleString();
//wrap everything into an Item (the id column is auto-generated, so the first argument is ignored)
Item item = new Item(1, Long.valueOf(spu), Long.valueOf(sku), title, Double.parseDouble(price), pic, url, created, updated);
list.add(item);
}
return list;
}
}
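The scheduled pool plus monitor thread works, but the same shutdown behaviour can be expressed more directly with a fixed pool whose workers drain the queue until it is empty. A sketch of that alternative, given as a fragment: it assumes the same list, httpClient and helper methods as JDSpider2, the extra imports java.util.concurrent.ExecutorService and Executors, and an enclosing method that declares throws Exception.
// alternative: fixed pool, each worker loops until the url queue is drained
ExecutorService pool = Executors.newFixedThreadPool(5);
for (int i = 0; i < 5; i++) {
    pool.submit(() -> {
        String indexUrl;
        // poll() returns null once the queue is empty, ending the loop
        while ((indexUrl = list.poll()) != null) {
            try {
                CloseableHttpResponse response = getCloseableHttpResponse(httpClient, indexUrl);
                String html = getHtml(response);
                if (html != null && !html.isEmpty()) {
                    ItemDao itemDao = new ItemDao();
                    for (Item item : getHtml2List(httpClient, html)) {
                        itemDao.add(item);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    });
}
// no further tasks will be submitted; wait for the workers, then close the client
pool.shutdown();
pool.awaitTermination(30, TimeUnit.MINUTES);
httpClient.close();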