需求分析:
首先访问京东,搜索手机,分析页面,我们抓取以下商品数据:
商品图片、价格、标题、商品详情页
一.开发准备
技术要求:Spring Boot、Spring Data JPA、HttpClient、Jsoup
数据库准备:
-- Switch to the crawler schema; it must already exist.
use crawler;
-- Stores one row per scraped product SKU.
-- NOTE(review): the Item entity in this tutorial declares spu/sku as String while the
-- columns below are bigint; confirm the implicit conversion is what you want.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键 id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合 id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元 id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  `price` bigint(10) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  -- Secondary index used when looking items up by SKU.
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
1.创建 Spring Boot 工程,导入依赖
<!-- Inherit dependency and plugin management from Spring Boot. -->
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.0.1.RELEASE</version>
</parent>
<groupId>com.itheima</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Spring Data JPA: required by the @Entity class and the JpaRepository DAO below -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- MySQL JDBC driver (version managed by the Spring Boot parent) -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <!-- Logging -->
    <!-- NOTE(review): spring-boot-starter-web already pulls in Logback through
         spring-boot-starter-logging, so this adds a second SLF4J binding; consider removing it. -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- Jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- Utilities -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
</dependencies>
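2.编写配置文件(application.properties,以下仅为示例,连接地址、账号和密码请按实际环境修改)
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
spring.datasource.username=root
spring.datasource.password=root
spring.jpa.show-sql=true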
3.编写启动类
/**
 * Application entry point.
 *
 * <p>{@code @EnableScheduling} (from {@code org.springframework.scheduling.annotation})
 * is required so that methods annotated with {@code @Scheduled} are actually run;
 * without it the crawler task is registered as a bean but never executed.
 */
@SpringBootApplication
@EnableScheduling
public class SpringBootRun {
    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(SpringBootRun.class, args);
    }
}
4.编写持久化类
/**
 * JPA entity mapped to the {@code jd_item} table; one instance describes one scraped SKU.
 * Each property is declared next to its accessor pair.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    /** Identifier of the product group (SPU). */
    private String spu;

    public String getSpu() {
        return this.spu;
    }

    public void setSpu(String spu) {
        this.spu = spu;
    }

    /** Identifier of the concrete product variant (SKU). */
    private String sku;

    public String getSku() {
        return this.sku;
    }

    public void setSku(String sku) {
        this.sku = sku;
    }

    /** Product title. */
    private String title;

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /** Product price. */
    private Long price;

    public Long getPrice() {
        return this.price;
    }

    public void setPrice(Long price) {
        this.price = price;
    }

    /** Product picture reference. */
    private String pic;

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    /** Address of the product detail page. */
    private String url;

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Creation timestamp. */
    private Date created;

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    /** Last-update timestamp. */
    private Date updated;

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
5.编写dao接口
/**
 * Spring Data JPA repository for {@code Item} entities with {@code Long} primary keys.
 * Declares no methods of its own; everything comes from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item,Long>{
}
6.编写service接口以及实现类
/**
 * Transactional {@code ItemService} implementation that delegates to {@code ItemDao}.
 */
@Service
@Transactional
public class ItemServiceImpl implements ItemService {
    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the given item.
     *
     * @param item the item to store
     */
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Finds items by example, using the given item as the probe.
     *
     * @param item probe object passed to {@code Example.of}
     * @return the items returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        // Parameterized instead of the raw Example type, so the call is type-checked.
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }
}
二.代码实现
封装HttpClient
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.context.annotation.ComponentScan;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
@ComponentScan
public class HttpUtil {
//http连接池
private static PoolingHttpClientConnectionManager pool;
static{
pool = new PoolingHttpClientConnectionManager();
pool.setMaxTotal(100);
pool.setDefaultMaxPerRoute(50);
}
/**
* 获取页面源码
*/
public String getHtml(String url){
CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
//通过get请求
HttpGet httpGet = new HttpGet();
httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
httpGet.setConfig(this.getConfig());
//发送请求
try {
CloseableHttpResponse response = build.execute(httpGet);
//判断发送的返回的状态
if (response.getStatusLine().getStatusCode()==200){
String string = EntityUtils.toString(response.getEntity(), "UTF-8");
return string;
}
} catch (IOException e) {
e.printStackTrace();
return null;
}
return null;
}
/**
* 下载图片
*/
public String getImage(String imgUrl){
CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
HttpGet httpGet = new HttpGet();
httpGet.setConfig(this.getConfig());
try {
CloseableHttpResponse response = build.execute(httpGet);
if (response.getStatusLine().getStatusCode()==200){
//后缀名 .jpg .png
String suffix = imgUrl.substring(imgUrl.lastIndexOf("."));
String newImg = UUID.randomUUID()+suffix;
//保存图片
FileOutputStream fileOutputStream = new FileOutputStream(new File("F:\\img\\" + newImg));
response.getEntity().writeTo(fileOutputStream);
return newImg;
}
} catch (IOException e) {
e.printStackTrace();
return null;
}
return null;
}
private RequestConfig getConfig(){
RequestConfig requestConfig = RequestConfig.custom()
.setConnectionRequestTimeout(500)
.setConnectTimeout(500)
.setSocketTimeout(1000 * 10)
.build();
return requestConfig;
}
}
使用定时任务编写页面抓取代码
package com.itheima.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.itheima.po.Item;
import com.itheima.service.ItemService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.util.Date;
/**
 * Scheduled crawler: walks JD search result pages, visits each SKU's detail page and
 * price endpoint, downloads the picture and stores one {@code Item} per SKU.
 */
@Component
public class ItemTask {
    @Autowired
    private ItemService itemService;
    @Autowired
    private HttpUtil httpUtil;
    public static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls search pages 1, 3, 5, 7 and 9 and saves every SKU found.
     * Scheduled with a fixed delay of 50 seconds (1000 * 50 ms).
     */
    @Scheduled(fixedDelay = 1000 * 50)
    public void process(){
        // JD search URL; the page number is appended.
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=59&click=0&page=";
        for (int i = 1; i < 10; i = i + 2) {
            String html = this.httpUtil.getHtml(url + i);
            // getHtml returns null on failure; Jsoup.parse rejects null input, so skip the page.
            if (html == null) {
                continue;
            }
            // Parse the page and persist its items.
            this.parseHtml(html);
        }
        System.out.println("执行完成");
    }

    /**
     * Parses one search result page and saves an {@code Item} for every SKU on it.
     *
     * @param html source of a search result page; must not be {@code null}
     */
    private void parseHtml(String html) {
        Document document = Jsoup.parse(html);
        // One list entry per SPU.
        Elements spuelements = document.select("div#J_goodsList li.gl-item");
        for (Element spuelement : spuelements) {
            String spuId = spuelement.attr("data-spu");
            Elements skuelements = spuelement.select("div.p-scroll li.ps-item");
            for (Element skuelement : skuelements) {
                Item item = new Item();
                item.setSpu(spuId);
                // SKU id.
                String skuid = skuelement.select("img").attr("data-sku");
                item.setSku(skuid);
                // Detail page URL for this SKU.
                String skuUrl = "https://item.jd.com/" + skuid + ".html";
                item.setUrl(skuUrl);
                // Fetch the detail page to read the title.
                String skuHtml = httpUtil.getHtml(skuUrl);
                if (skuHtml == null) {
                    // Detail page unavailable: skip this SKU rather than abort the whole run.
                    continue;
                }
                Document skuDocumet = Jsoup.parse(skuHtml);
                String skuTitle = skuDocumet.select("div.sku-name").text();
                item.setTitle(skuTitle);
                // The price is loaded asynchronously, so it is fetched from the price endpoint.
                String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + skuid;
                String parceHtml = httpUtil.getHtml(priceUrl);
                if (parceHtml != null) {
                    try {
                        // Parse the response body (not the URL string) with the shared mapper.
                        long price = MAPPER.readTree(parceHtml).get(0).get("p").asLong();
                        item.setPrice(price);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // Picture URL: fall back to the lazy-load attribute when src is empty.
                String imgUrl = skuelement.select("img").attr("src");
                if (StringUtils.isEmpty(imgUrl)){
                    imgUrl = skuelement.select("img").attr("data-lazy-img");
                }
                imgUrl = imgUrl.replace("/n9/","/n7/");
                String imageNewName = httpUtil.getImg("http:" + imgUrl);
                item.setPic(imageNewName);
                item.setCreated(new Date());
                itemService.save(item);
            }
        }
    }
}
使用json解析获得的页面
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
/**
 * HTTP helper built on a shared pooled connection manager: fetches page source
 * and downloads images to local disk.
 */
@Component
public class HttpUtil {
    // Shared HTTP connection pool.
    private static PoolingHttpClientConnectionManager pool;

    static {
        pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(50);
    }

    /**
     * Fetches a page with HttpClient.
     *
     * @param url address to request
     * @return the response body decoded as UTF-8, or {@code null} when the status is not 200
     *         or an I/O error occurs
     */
    public String getHtml(String url){
        // The client is not closed here: it shares the static pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
        httpGet.setConfig(this.getConfig());
        // Closing the response returns the connection to the pool on every path,
        // including non-200 responses, which previously kept it leased.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads an image and saves it under {@code F:\img\} with a random UUID file name.
     *
     * @param imgUrl address of the image
     * @return the new file name, or {@code null} when the status is not 200 or an I/O error occurs
     */
    public String getImg(String imgUrl){
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(imgUrl);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Keep the original extension (.jpg, .png, ...); none if the URL has no dot.
                int dot = imgUrl.lastIndexOf(".");
                String suffix = dot >= 0 ? imgUrl.substring(dot) : "";
                String imgNewName = UUID.randomUUID() + suffix;
                // Close the file stream once the entity has been written.
                try (FileOutputStream outputStream = new FileOutputStream("F:\\img\\" + imgNewName)) {
                    response.getEntity().writeTo(outputStream);
                }
                return imgNewName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds the per-request timeout configuration.
     *
     * @return config with 1 s pool-lease and connect timeouts and a 10 s socket timeout
     */
    private RequestConfig getConfig(){
        return RequestConfig.custom()
                .setConnectionRequestTimeout(1000)
                .setConnectTimeout(1000)
                .setSocketTimeout(1000 * 10)
                .build();
    }
}