📫 作者简介:「子非我鱼」,专注于研究全栈
🔥 三连支持:欢迎 ❤️关注、👍点赞、👉收藏三连,支持一下博主~
文章目录
步骤一:导入数据库
// 先创建 crawler 数据库,再在其中创建表(注意:步骤三 application.yml 的 JDBC URL 连接的库名是 myqxin,两处库名需保持一致)
-- JD product table: one row per crawled SKU.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  -- decimal rather than bigint: the Item entity maps price as Double and the crawler
  -- stores fractional prices, which an integer column would silently truncate.
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  -- secondary index: the crawler looks items up by sku before inserting
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
步骤二:导入所需依赖
<dependencies>
    <!-- Spring MVC / embedded web server (also brings in Jackson) -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.11</version>
    </dependency>
    <!-- Apache HttpClient, used to download pages and images -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!-- Jsoup HTML parser -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- tk.mybatis common Mapper starter -->
    <dependency>
        <groupId>tk.mybatis</groupId>
        <artifactId>mapper-spring-boot-starter</artifactId>
        <version>2.0.2</version>
    </dependency>
    <!-- Utility library (StringUtils) -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
    </dependency>
    <!-- Lombok: required by the @Data annotation on the Item entity;
         version is expected to come from the Spring Boot parent/BOM like the other unversioned entries -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
</dependencies>
步骤三:配置application.yml文件
# HTTP port of the embedded server
server:
  port: 81
spring:
  datasource:
    # Connector/J 8.x driver class; the legacy com.mysql.jdbc.Driver name is deprecated
    # and only kept as a compatibility alias that logs a warning on load.
    driver-class-name: com.mysql.cj.jdbc.Driver
    # The database name in this URL must be the one that contains the jd_item table
    url: jdbc:mysql://localhost:3306/myqxin?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
    username: root
    password: 9527
步骤四:创建pojo实体类
/**
 * Entity mapped to the {@code jd_item} table; accessors, {@code equals}/{@code hashCode}
 * and {@code toString} are generated by Lombok's {@code @Data}.
 */
@Table(name = "jd_item")
@Data
public class Item {

    /** Primary key, generated by the database (identity strategy). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit: id of the product group. */
    private Long spu;

    /** Stock keeping unit: id of the smallest sellable variant. */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product image (the crawler stores the downloaded file name here). */
    private String pic;

    /** URL of the product detail page. */
    private String url;

    /** Creation time. */
    private Date created;

    /** Last update time. */
    private Date updated;
}
步骤五:创建Mapper和Service
Mapper
/**
 * MyBatis mapper for {@link Item}. Declares no methods of its own; everything it offers
 * is inherited from the generic {@code Mapper<Item>} base interface
 * (presumably tk.mybatis' common Mapper, given the project dependencies - confirm the import).
 *
 * @author: myqxin
 * @Desc: data-access interface for the jd_item entity
 * @create: 2021-06-25 14:24
 **/
@org.apache.ibatis.annotations.Mapper
public interface ItemMapper extends Mapper<Item> {
    // intentionally empty
}
Service
/**
 * Spring-managed implementation of {@code ItemService}.
 *
 * <p>Placeholder only: no methods are defined here. Whatever {@code ItemService} declares
 * (the interface is not shown in this snippet) still has to be implemented.
 */
@Service
public class ItemServiceImpl implements ItemService {
    // intentionally empty in this snippet
}
步骤六:创建启动类
/**
 * Application entry point. {@code @EnableScheduling} switches on processing of
 * scheduled-task annotations.
 */
@SpringBootApplication
@EnableScheduling
public class MySpringBootStarter {

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments, passed through to Spring
     */
    public static void main(String[] args) {
        SpringApplication.run(MySpringBootStarter.class, args);
    }
}
步骤七:创建HttpUtils工具类
package com.czxy.music.web.config;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
/**
 * HTTP helper used by the crawler: downloads page content as text and downloads images
 * to a local directory, sharing one pooled connection manager across requests.
 *
 * @author: myqxin
 * @Desc: HttpClient-based download utilities (HTML text and image files)
 * @create: 2021-06-25 14:39
 **/
@Component
public class HttpUtils {

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "C:\\Users\\myqxin\\Desktop\\Jdimages\\";

    /** Connection pool shared by every client built in this class. */
    private final PoolingHttpClientConnectionManager pm;

    public HttpUtils() {
        this.pm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool
        this.pm.setMaxTotal(100);
        // Maximum number of connections per host (route)
        this.pm.setDefaultMaxPerRoute(10);
    }

    /**
     * Downloads the content at the given address as text.
     *
     * @param url address to request
     * @return the response body decoded as UTF-8, or an empty string when the status is
     *         not 200, the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.pm).build();
        HttpGet httpGet = buildRequest(url);

        // try-with-resources closes the response on every path
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Download failed: return an empty string
        return "";
    }

    /**
     * Downloads an image and stores it in the image directory under a random UUID name
     * that keeps the extension found in the URL (if any).
     *
     * @param url image address
     * @return the generated file name, or an empty string when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String doGetImage(String url) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.pm).build();
        HttpGet httpGet = buildRequest(url);

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the original extension
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // The stream must be closed, otherwise every download leaks a file handle
                try (FileOutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Download failed: return an empty string
        return "";
    }

    /** Builds a GET request for the URL with the shared timeouts and headers applied. */
    private HttpGet buildRequest(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        // Set request headers so the request looks like it comes from a browser
        setHeaders(httpGet);
        return httpGet;
    }

    /**
     * Extracts the file extension (including the leading dot) from the path part of a URL.
     * Query string and fragment are ignored; returns an empty string when the last path
     * segment has no dot, instead of failing or picking up a dot from the host name.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /**
     * Builds the request configuration (timeouts).
     *
     * @return request configuration applied to every request
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish a connection, in ms
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool, in ms
                .setSocketTimeout(10000)          // max time for data transfer, in ms
                .build();
    }

    /**
     * Sets the request headers.
     *
     * @param httpGet request to decorate with a browser User-Agent
     */
    private void setHeaders(HttpGet httpGet) {
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
    }
}
注意:使用 HttpClient 爬取数据时,为了防止被网站拦截,应该设置请求头(例如 User-Agent),把请求伪装成浏览器发出的请求。
步骤八:实现爬取京东手机信息数据(https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=cab755112a5e463e9ec1c356ac31fd1c&s=56&click=0&page=1)
package com.czxy.music.web.controller;
import com.czxy.music.pojo.Item;
import com.czxy.music.service.ItemService;
import com.czxy.music.web.config.HttpUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.Date;
import java.util.List;
/**
 * Scheduled crawler task: fetches JD phone search result pages, extracts every SKU that
 * is not yet known to {@code ItemService}, and collects its image, price and title.
 *
 * @author: myqxin
 * @Desc: scheduled task that crawls JD phone listings
 * @create: 2021-06-25 15:28
 **/
@Component
public class ItemTask {

    @Resource
    private HttpUtils httpUtils;

    @Resource
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls search result pages 1 and 3. Scheduled with a fixed delay of 100 seconds
     * between the end of one run and the start of the next.
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() {
        // Base address of the search result pages; the page number is appended
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=cab755112a5e463e9ec1c356ac31fd1c&s=56&click=0&page=";
        for (int i = 1; i < 5; i = i + 2) {
            String html = httpUtils.doGetHtml(url + i);
            // Parse the page and process the products it lists
            parse(html);
        }
        System.out.println("手机页面数据抓取完成");
    }

    /**
     * Parses one search result page and builds an {@code Item} for every SKU that
     * {@code itemService.findAll} does not already return.
     *
     * @param html page markup; an empty string simply yields no products
     */
    private void parse(String html) {
        Document doc = Jsoup.parse(html);
        // One <li> per SPU (product group)
        Elements spuEles = doc.select("div#J_goodsList > ul > li");
        for (Element spuEle : spuEles) {
            String spuAttr = spuEle.attr("data-spu");
            // Skip entries without a usable (numeric) data-spu value
            if (!StringUtils.isNumeric(spuAttr)) {
                continue;
            }
            long spu = Long.parseLong(spuAttr);

            // One li.ps-item per SKU (variant) inside the SPU
            for (Element skuEle : spuEle.select("li.ps-item")) {
                // Guard against markup changes: first() returns null when nothing matches
                Element skuHolder = skuEle.select("[data-sku]").first();
                if (skuHolder == null || !StringUtils.isNumeric(skuHolder.attr("data-sku"))) {
                    continue;
                }
                long sku = Long.parseLong(skuHolder.attr("data-sku"));

                // Look the product up by sku
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = itemService.findAll(item);
                if (list != null && !list.isEmpty()) {
                    // Product already exists: do not process it again
                    continue;
                }

                item.setSpu(spu);

                // Product detail page address
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);

                // Product image: the lazy-load attribute holds a scheme-relative address
                item.setPic(downloadImage(skuEle));

                // Product price
                item.setPrice(fetchPrice(sku));

                // Product title, taken from the detail page
                String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(title);

                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                // This is where the item would be saved; the tutorial only prints it
                System.err.println(item);
            }
        }
    }

    /**
     * Downloads the SKU's image and returns the stored file name; returns an empty string
     * when the element has no image address or the download fails.
     */
    private String downloadImage(Element skuEle) {
        Element img = skuEle.select("img[data-sku]").first();
        String lazyImg = (img == null) ? "" : img.attr("data-lazy-img");
        if (StringUtils.isBlank(lazyImg)) {
            return "";
        }
        return httpUtils.doGetImage("https:" + lazyImg);
    }

    /**
     * Fetches the price of a SKU from the price endpoint. Returns 0 when the request
     * fails, the response is not valid JSON, or the expected field is missing.
     */
    private double fetchPrice(long sku) {
        String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
        try {
            // path() never returns null (missing nodes convert to 0.0), unlike get(),
            // which caused a NullPointerException when the download returned "".
            return MAPPER.readTree(priceJson).path(0).path("p").asDouble();
        } catch (JsonProcessingException e) {
            e.printStackTrace();
            return 0;
        }
    }
}
ItemService 里应有两个方法:一个按条件查询(即上面 ItemTask 调用的 findAll(Item)),一个保存。我这里只是模拟,没有给出实现(ItemServiceImpl 为空,保存处仅用 System.err.println 打印),请根据实际业务情况自行实现。