环境搭建
MySQL环境搭建
安装MySQL
MySQL安装包下载及安装介绍
创建数据库及数据表
-- Schema for the JD product crawler: one database with a single table that
-- receives one row per scraped search-result item.
-- IF NOT EXISTS keeps the script re-runnable on an already initialised server.
CREATE DATABASE IF NOT EXISTS `jd_spider`;
USE `jd_spider`;

CREATE TABLE IF NOT EXISTS `t_jd_item` (
    `id`      bigint(10)    NOT NULL AUTO_INCREMENT COMMENT '主键id',
    `spu`     bigint(15)    DEFAULT NULL COMMENT '商品集合id',
    `sku`     bigint(15)    DEFAULT NULL COMMENT '商品最小品类单元id',
    `title`   varchar(1000) DEFAULT NULL COMMENT '商品标题',
    -- Exact decimal with two fraction digits. The previous double(10,0)
    -- rounded every price to a whole number (e.g. 299.90 was stored as 300).
    `price`   decimal(10,2) DEFAULT NULL COMMENT '商品价格',
    `pic`     varchar(200)  DEFAULT NULL COMMENT '商品图片',
    `url`     varchar(1500) DEFAULT NULL COMMENT '商品详情地址',
    -- created/updated stay textual because the crawler writes them as
    -- 'yyyy-MM-dd' strings and reads them back as strings.
    `created` varchar(100)  DEFAULT NULL COMMENT '创建时间',
    `updated` varchar(100)  DEFAULT NULL COMMENT '更新时间',
    PRIMARY KEY (`id`),
    KEY `idx_sku` (`sku`) USING BTREE
-- utf8mb4 instead of MySQL's 3-byte utf8 so that titles containing 4-byte
-- characters (emoji, rare CJK) can be stored without an insert error.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='京东商品';
创建本地文件夹存储图片文件
E:/BigData/listdata/MHD
java环境搭建
创建新的Maven项目(不使用模板)
编辑项目中的管理文件pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler: declares the connection pool,
     JDBC driver, HTTP client, HTML parser and helper libraries, and pins the
     compiler to Java 1.8. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- No whitespace allowed inside an id; also matches the Java package com.zkr -->
    <groupId>com.zkr</groupId>
    <artifactId>jdSpider</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Project dependencies -->
    <dependencies>
        <!-- c3p0 connection pool -->
        <dependency>
            <groupId>com.mchange</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.5.5</version>
            <scope>compile</scope>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
            <scope>compile</scope>
        </dependency>
        <!-- JUnit: coordinates are case-sensitive and must be lower-case -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- Lombok (generates boilerplate for the entity class) -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
        </dependency>
        <!-- Apache HttpClient (page and image download) -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup (HTML parsing) -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
    </dependencies>

    <!-- Build settings -->
    <build>
        <plugins>
            <!-- Always compile with source/target level 1.8 and UTF-8 sources -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
由于本地仓库中没有依赖包以及插件,点击右侧的Maven选项,再点击刷新联网下载依赖包以及插件到本地Maven仓库
在项目中创建c3p0连接池的配置文件c3p0.properties(需位于classpath根目录,例如src/main/resources目录下)
在c3p0.properties中写入配置,c3p0的jar包会自动读取文件中的配置
# JDBC driver class to register
c3p0.driverClass=com.mysql.jdbc.Driver
# JDBC URL of the database to connect to
c3p0.jdbcUrl=jdbc:mysql://127.0.0.1:3306/jd_spider
# Database user name
c3p0.user=root
# Database password
c3p0.password=123456
# Number of connections the pool opens at start-up. Default: 3
c3p0.initialPoolSize=5
# Maximum number of connections kept in the pool. Default: 15
c3p0.maxPoolSize=10
# How long a client calling getConnection() waits for a connection when the pool is exhausted
# before an SQLException is thrown; 0 waits indefinitely. Unit: milliseconds. Default: 0
c3p0.checkoutTimeout=3000
# Maximum idle time; connections idle for longer are discarded. 0 or a negative value means never discard. Default: 0
# NOTE(review): the c3p0 documentation specifies this value in seconds (unlike checkoutTimeout),
# which would make 10000 roughly 2.8 hours - confirm this is the intended duration.
c3p0.maxIdleTime=10000
编写c3p0工具类
在项目中创建包(package)com.zkr.spider.jd.utils
在该包下创建c3p0工具类 C3P0Utils
写入代码
package com.zkr.spider.jd.utils;
import com.mchange.v2.c3p0.ComboPooledDataSource;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Static helper around a single shared c3p0 connection pool: hands out
 * connections and closes JDBC resources.
 */
public class C3P0Utils {
    // Held through the DataSource interface so that switching to another pool
    // implementation only requires changing this initializer.
    private static DataSource ds = new ComboPooledDataSource();

    /**
     * Borrows a connection from the pool.
     *
     * @return a pooled connection, or {@code null} when the pool could not
     *         provide one (the SQLException stack trace is printed)
     */
    public static Connection getConnection() {
        try {
            return ds.getConnection();
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Closes the given JDBC objects in the order result set, statement,
     * connection. Each argument may be {@code null}; a failure to close one
     * object is printed and does not prevent the remaining ones from closing.
     *
     * @param conn connection to close, may be {@code null}
     * @param stmt statement to close, may be {@code null}
     * @param rs   result set to close, may be {@code null}
     */
    public static void closeAll(Connection conn, Statement stmt, ResultSet rs) {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }

        try {
            if (stmt != null) {
                stmt.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }

        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
添加实体类
通常当Java程序要访问数据表时,会在Java程序中创建一个和数据表对应的实体类:
类名 == 表名
属性 == 字段
在项目中创建包(package)com.zkr.spider.jd.pojo
在该包下创建实体类 Item
package com.zkr.spider.jd.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Entity holding one scraped product; its fields correspond one-to-one to the
 * columns of the t_jd_item table.
 *
 * Lombok generates the getters, setters, toString and the no-argument and
 * all-argument constructors. The all-argument constructor is invoked
 * positionally by the crawler, so the field declaration order below should
 * not be changed without updating that call.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Item {
// Product id (primary key); the crawler passes null for new items
private Long id;
// Standard product unit (the product group this item belongs to)
private Long spu;
// Stock keeping unit (the smallest sellable variant)
private Long sku;
// Product title
private String title;
// Product price
private Double price;
// Product image (the crawler stores the local file path of the downloaded picture)
private String pic;
// URL of the product detail page
private String url;
// Creation time, kept as a string (the crawler formats it as yyyy-MM-dd)
private String created;
// Last update time, kept as a string (the crawler formats it as yyyy-MM-dd)
private String updated;
}
项目开发
package com.zkr.spider.jd;
/**
* author:zkrsun
* create:2020-08-17
*/
import com.zkr.spider.jd.pojo.Item;
import com.zkr.spider.jd.utils.C3P0Utils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Crawls JD.com search results for a fixed keyword, downloads each product
 * picture to a local folder and stores the product data in the t_jd_item table.
 */
public class JdSpider {

    /** Number of search-result pages to crawl. */
    private static final int MAX_PAGE = 20;

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36";

    /** Local folder (with trailing slash) the product pictures are written to; it must already exist. */
    private static final String IMAGE_DIR = "E:/BigData/listdata/MHD/";

    /**
     * Crawls pages 1..MAX_PAGE. A page that cannot be fetched, parsed or
     * stored is reported and skipped; the crawl always advances to the next
     * page, so a persistent failure can no longer cause an endless loop.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // try-with-resources closes the HTTP client even if a page fails unexpectedly
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (int page = 1; page <= MAX_PAGE; page++) {
                String url = "https://search.jd.com/Search?keyword=%E7%A7%BB%E5%8A%A8%E7%A1%AC%E7%9B%98&page=" + page + "&click=0";
                try {
                    // html is local to the iteration so a failed request can never
                    // cause the previous page's content to be processed again
                    String html = getHtml(url, httpClient);
                    if (html == null) {
                        System.err.println("Skipping page " + page + ": no usable response");
                        continue;
                    }
                    saveItem(getList(html, httpClient));
                } catch (IOException | SQLException e) {
                    System.err.println("Skipping page " + page + " after error");
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            // only reachable when closing the HTTP client fails
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page at the given URL.
     *
     * @param url        address of the page
     * @param httpClient client used to send the request
     * @return the response body decoded as UTF-8, or {@code null} when the
     *         status code is not 200 or the response has no body
     * @throws IOException if the request or reading the body fails
     */
    public static String getHtml(String url, CloseableHttpClient httpClient) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", USER_AGENT);
        // the response is released on every path, including exceptions
        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            if (httpResponse.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            HttpEntity entity = httpResponse.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, "utf-8");
        }
    }

    /**
     * Parses one search-result page into items and downloads their pictures.
     *
     * List entries without a numeric data-sku are skipped. A missing or
     * unparsable price, picture or detail link is stored as {@code null}
     * rather than aborting the page. Picture download is best-effort: a
     * failed download is reported and the item's picture path is left null.
     *
     * @param html       HTML of a search-result page
     * @param httpClient client used to download the pictures
     * @return the items found on the page, possibly empty
     * @throws IOException kept for interface compatibility
     */
    public static List<Item> getList(String html, CloseableHttpClient httpClient) throws IOException {
        Document doc = Jsoup.parse(html);
        Elements lis = doc.select("#J_goodsList>.gl-warp>li");
        List<Item> itemsList = new ArrayList<>();
        // one date stamp for the whole page instead of a new formatter per item
        String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());

        for (Element li : lis) {
            Long sku = parseId(li.attr("data-sku"));
            if (sku == null) {
                // not a product entry; nothing usable to store
                continue;
            }
            // entries without their own spu fall back to the sku
            Long spu = parseId(li.attr("data-spu"));
            if (spu == null) {
                spu = sku;
            }

            String title = li.select(".p-name em").text();
            Double price = parsePrice(li.select(".p-price i").text());

            String picUrl = li.select(".p-img>a>img").attr("src");
            if (picUrl.isEmpty()) {
                // NOTE(review): lazily loaded entries appear to carry the address in
                // data-lazy-img instead of src - confirm against the current page markup
                picUrl = li.select(".p-img>a>img").attr("data-lazy-img");
            }
            String pic = null;
            if (!picUrl.isEmpty()) {
                String pathName = IMAGE_DIR + picUrl.substring(picUrl.lastIndexOf("/") + 1);
                try {
                    if (downloadImage(toAbsoluteUrl(picUrl), pathName, httpClient)) {
                        pic = pathName;
                    }
                } catch (IOException | IllegalArgumentException e) {
                    // one broken picture must not discard the rest of the page
                    e.printStackTrace();
                }
            }

            String href = li.select(".p-img>a").attr("href");
            // the scheme separator was missing before ("https" + "//host/..." gave "https//host/...")
            String goodsUrl = href.isEmpty() ? null : toAbsoluteUrl(href);

            itemsList.add(new Item(null, spu, sku, title, price, pic, goodsUrl, today, today));
        }
        System.out.println(itemsList);
        return itemsList;
    }

    /**
     * Inserts the given items into t_jd_item, one row per item.
     *
     * @param itemsList items to store; {@code null} or empty is a no-op
     * @throws SQLException if no connection is available or an insert fails
     */
    public static void saveItem(List<Item> itemsList) throws SQLException {
        if (itemsList == null || itemsList.isEmpty()) {
            return;
        }
        // explicit column list so the statement does not depend on the table's column order
        String sql = "insert into t_jd_item (spu, sku, title, price, pic, url, created, updated) values (?,?,?,?,?,?,?,?)";
        Connection conn = C3P0Utils.getConnection();
        if (conn == null) {
            throw new SQLException("Could not obtain a database connection from the pool");
        }
        PreparedStatement pstmt = null;
        try {
            pstmt = conn.prepareStatement(sql);
            for (Item item : itemsList) {
                pstmt.setLong(1, item.getSpu());
                pstmt.setLong(2, item.getSku());
                pstmt.setString(3, item.getTitle());
                if (item.getPrice() == null) {
                    pstmt.setNull(4, java.sql.Types.DOUBLE);
                } else {
                    pstmt.setDouble(4, item.getPrice());
                }
                pstmt.setString(5, item.getPic());
                pstmt.setString(6, item.getUrl());
                pstmt.setString(7, item.getCreated());
                pstmt.setString(8, item.getUpdated());
                pstmt.executeUpdate();
            }
        } finally {
            // return the connection to the pool even when an insert fails
            C3P0Utils.closeAll(conn, pstmt, null);
        }
    }

    /** Saves the picture at imageUrl to pathName; returns false when the response is not a 200 with a body. */
    private static boolean downloadImage(String imageUrl, String pathName, CloseableHttpClient httpClient) throws IOException {
        HttpGet imgGet = new HttpGet(imageUrl);
        imgGet.setHeader("User-Agent", USER_AGENT);
        try (CloseableHttpResponse imgResponse = httpClient.execute(imgGet)) {
            HttpEntity entity = imgResponse.getEntity();
            if (imgResponse.getStatusLine().getStatusCode() != 200 || entity == null) {
                return false;
            }
            try (InputStream input = entity.getContent();
                 FileOutputStream fos = new FileOutputStream(pathName)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = input.read(buffer)) != -1) {
                    fos.write(buffer, 0, len);
                }
            }
            return true;
        }
    }

    /** Turns a scheme-relative link ("//host/path") into an https URL; links that already have a scheme are returned unchanged. */
    private static String toAbsoluteUrl(String link) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        return "https:" + link;
    }

    /** Parses a numeric id attribute; returns null when it is blank or not a number. */
    private static Long parseId(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /** Parses a price text; returns null when it is blank or not a number. */
    private static Double parsePrice(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Double.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}