java爬虫入门案例
本博客仅供学习使用,案例以抓取京东"手机"搜索结果为例。
1.sql脚本
-- Schema for the JD product table filled by the crawler.
-- The table is named `jd_item` so that it matches @Table(name = "jd_item") on the Item entity;
-- the previous name `jb_item` left JPA querying a table that did not exist.
-- Disable FK checks while the table is dropped/recreated; re-enabled at the end of the script.
SET FOREIGN_KEY_CHECKS = 0;
DROP TABLE IF EXISTS `jd_item`;
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) NULL DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) NULL DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品标题',
  -- Exact decimal: the entity stores price as a Double (e.g. 1999.00); bigint would drop the fraction.
  `price` decimal(10, 2) NULL DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime(0) NULL DEFAULT NULL COMMENT '创建时间',
  `updated` datetime(0) NULL DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`) USING BTREE,
  -- The crawler looks items up by sku before inserting, so sku is indexed.
  INDEX `sku`(`sku`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '京东商品表' ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
2.pom依赖
<!-- Spring MVC (embedded web starter) -->
<!-- NOTE(review): dependencies without a <version> presumably rely on a Spring Boot parent/BOM for version management; confirm in the full pom -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA: backs the ItemDao repository -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC driver (provides com.mysql.cj.jdbc.Driver) -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.11</version>
</dependency>
<!-- Apache HttpClient: used by HttpUtils to download pages and images -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<!-- jsoup: HTML parsing of the downloaded pages -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!-- Utility library (Apache Commons Lang 3) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
3.application.properties
# MySQL datasource used by Spring Data JPA (database: dailytest on localhost:3306).
spring.datasource.url=jdbc:mysql://localhost:3306/dailytest?useUnicode=true&characterEncoding=UTF-8&useSSL=false&serverTimezone=Asia/Shanghai&zeroDateTimeBehavior=CONVERT_TO_NULL&allowPublicKeyRetrieval=true
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
# Demo credentials only; do not commit real credentials in plain text.
spring.datasource.username=root
spring.datasource.password=123456
# JPA
# Target database platform. The Spring Boot key is spring.jpa.database
# (spring.jpa.datasource is not a recognised property and was silently ignored).
spring.jpa.database=MySQL
# Log the generated SQL statements. The Spring Boot key is spring.jpa.show-sql
# (spring.jpa.show is not a recognised property and was silently ignored).
spring.jpa.show-sql=true
4.pojo
/**
 * JPA entity for one crawled product row in the {@code jd_item} table.
 *
 * <p>Fields are mapped by field access (the {@code @Id} annotation sits on the field).
 * The accessors below are plain getters/setters; they are required by the crawler task,
 * which populates an instance field by field before saving it.
 */
@Entity
@Table(name = "jd_item")
public class Item {
    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    /** Standard product unit: id of the product group this item belongs to. */
    private Long spu;
    /** Stock keeping unit: id of the smallest sellable variant. */
    private Long sku;
    /** Product title. */
    private String title;
    /** Product price. */
    private Double price;
    /** Product image. */
    private String pic;
    /** Product detail page address. */
    private String url;
    /** Creation time. */
    private Date created;
    /** Last update time. */
    private Date updated;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getSpu() {
        return spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPic() {
        return pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreated() {
        return created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
5.dao
/**
 * Spring Data JPA repository for {@link Item} entities with a {@code Long} primary key.
 * Declares no methods of its own; all operations used by the service
 * (save, findAll by example) are inherited from {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
6.service
/**
 * Service contract for storing and querying crawled {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Queries products by condition.
     *
     * @param item carries the query conditions
     * @return the matching products
     */
    List<Item> findAll(Item item);
}
7.service实现
/**
 * {@link ItemService} implementation that delegates directly to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /** Saves the given item through the repository. */
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Runs a query-by-example: the given item is wrapped in an {@code Example}
     * and used as the query condition.
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}
8.启动类
/**
 * Spring Boot entry point for the crawler application.
 */
@SpringBootApplication
// Enables scheduled-task support; required for methods annotated with @Scheduled to run.
@EnableScheduling
public class Application {
/** Starts the Spring application, passing the command-line arguments through. */
public static void main(String[] args) {
SpringApplication.run(Application.class, args);
}
}
9.封装HttpClient
/**
 * Small wrapper around Apache HttpClient used by the crawler to download
 * HTML pages and product images through one pooled connection manager.
 *
 * <p>Both download methods are best-effort: I/O failures are printed and an
 * empty string is returned instead of throwing.
 */
@Component
public class HttpUtils {
    /** Directory that downloaded images are written to (must already exist). */
    private static final String IMAGE_DIR = "E:\\学习\\文件夹\\自学系列\\spider\\OutPutPic\\";

    private PoolingHttpClientConnectionManager poolingHttpClientConnectionManager;

    /** Single client backed by the pool; built once instead of once per request. */
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.poolingHttpClientConnectionManager = new PoolingHttpClientConnectionManager();
        // Pool limits: 100 connections overall, 10 per route (host).
        poolingHttpClientConnectionManager.setMaxTotal(100);
        poolingHttpClientConnectionManager.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom()
                .setConnectionManager(poolingHttpClientConnectionManager)
                .build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address to request with HTTP GET
     * @return the response body decoded as UTF-8, or {@code ""} when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = setJdHeader(new HttpGet(url));
        httpGet.setConfig(getConfig());
        // try-with-resources closes the response (releasing the pooled connection) on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads an image and stores it under a random (UUID based) file name.
     *
     * @param url address of the image
     * @return the generated file name, or {@code ""} when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String doGetImage(String url) {
        HttpGet httpGet = setJdHeader(new HttpGet(url));
        httpGet.setConfig(getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the original extension.
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // The stream is closed even if writing fails, so the file handle is never leaked.
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Returns the text from the last '.' of the address onwards, or "" when there is no '.'. */
    private String extensionOf(String url) {
        int dot = url.lastIndexOf(".");
        return dot < 0 ? "" : url.substring(dot);
    }

    /** Per-request timeouts, in milliseconds. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectionRequestTimeout(500) // max wait for a connection from the pool
                .setConnectTimeout(1000)          // max time to establish the connection
                .setSocketTimeout(10000)          // socket timeout
                .build();
    }

    /**
     * Adds browser-like request headers.
     *
     * <p>The HTTP/2 pseudo-headers (":authority", ":method", ":path", ":scheme") copied from
     * browser dev tools are intentionally not sent: they are not valid header names for the
     * request this client builds, and they described the search page even when the request
     * targeted an image or price host. "accept-encoding" is also left to the client so that
     * "br" is not advertised; this code has no Brotli decoder, so such a body would be unreadable.
     */
    private HttpGet setJdHeader(HttpGet httpGet) {
        httpGet.setHeader("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpGet.setHeader("accept-language","zh-CN,zh;q=0.9");
        httpGet.setHeader("cache-control","no-cache");
        httpGet.setHeader("pragma","no-cache");
        httpGet.setHeader("referer","https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&suggest=1.his.0.0&wq=&pvid=d1daa37ee21e48a187842cccb7432d30");
        httpGet.setHeader("sec-fetch-dest","document");
        httpGet.setHeader("sec-fetch-mode","navigate");
        httpGet.setHeader("sec-fetch-site","same-origin");
        httpGet.setHeader("sec-fetch-user","?1");
        httpGet.setHeader("upgrade-insecure-requests","1");
        httpGet.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36");
        return httpGet;
    }
}
10.定时任务类
@Component
public class ItemTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private ItemService itemService;
private static final ObjectMapper objectMapper = new ObjectMapper();
//下载任务完成后,间隔多长时间进行下一次任务(单位:毫秒)
@Scheduled(fixedDelay = 100*1000)
public void itemTask()throws Exception{
//声明需要解析的初始地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&s=61&click=0&page=";
//按照页码对手机搜索结果进行解析
for (int i=1; i<2; i+=2){
String html = httpUtils.doGetHtml(url + i);
parse(html);
}
System.out.println("手机数据抓取完成!");
}
private void parse(String html) throws JsonProcessingException {
Document document = Jsoup.parse(html);
Elements spuEles = document.select("div#J_goodsList > ul > li");
for (Element spuEle:
spuEles) {
//获取spu
long spu = Long.parseLong(spuEle.attr("data-spu"));
//获取sku
Elements skuEles = spuEle.select("li.ps-item");
for (Element skuEle :
skuEles) {
long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
//根据sku查询商品数据
Item item = new Item();
item.setSku(sku);
List<Item> list = itemService.findAll(item);
if (list.size()>0){
continue;
}
item.setSpu(spu);
//获取商品详情url
String itemUrl = "https://item.jd.com/" + sku +".html";
item.setUrl(itemUrl);
String src = "https:"+ skuEle.select("img[data-sku]").first().attr("data-lazy-img");
src = src.replace("/n9/", "/n1/");
String image = httpUtils.doGetImage(src);
item.setPic(src);
//获取商品价格
String priceHtml = httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
double price = objectMapper.readTree(priceHtml).get(0).get("p").asDouble();
item.setPrice(price);
//获取标题
String itemInfo = httpUtils.doGetHtml(item.getUrl());
String text = Jsoup.parse(itemInfo).select("div.sku-name").text();
item.setTitle(text);
item.setCreated(new Date());
item.setUpdated(item.getCreated());
//保存到数据库
itemService.save(item);
}
}
}
}
11.结果