网页端不登录可访问的网页,爬虫显示登录页面怎么办?
httpGet.setHeader("User-Agent","值");
- 值可通过F12查看Header获取
Java爬虫——Springboot+JPA 爬取京东产品信息(oracle)
一、添加Maven依赖
<dependencies>
<!-- SpringMVC -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>2.6.0</version>
</dependency>
<!-- SpringData jpa -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
<version>2.6.0</version>
</dependency>
<!-- oracle驱动依赖 -->
<dependency>
<groupId>com.oracle</groupId>
<artifactId>ojdbc6</artifactId>
<version>11.2.0.4</version>
</dependency>
<!-- 用于抓取数据 httpClient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- jsoup解析html -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!--lang库的工具包,例如StringUtils-->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.11</version>
</dependency>
<!-- 日志依赖:springboot默认slf4j+logback(spring-boot-starter-web 已传递引入此依赖,可省略) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</dependency>
</dependencies>
二、添加application.properties配置文件
# oracle数据库配置
spring.datasource.driver-class-name=oracle.jdbc.OracleDriver
spring.datasource.url=jdbc:oracle:thin:@localhost:1521:ORCL
spring.datasource.username=数据库用户名(Oracle 通常为 system 或自建的 schema 用户,而非 root)
spring.datasource.password=数据库密码
# jpa配置
spring.jpa.database=oracle
spring.jpa.show-sql=true
# 更新或者创建数据表结构
spring.jpa.hibernate.ddl-auto=update
三、代码实现
1. 编写实体类
1.1 JdItem.class
package com.crawler.entities;
import org.hibernate.annotations.Comment;
import javax.persistence.*;
import java.util.Date;
@Entity
@Table(name = "crawler_jd_item")
public class JdItem {

    public JdItem() {
    }

    /**
     * Convenience constructor for all text/price fields.
     * Note: {@code sku} is not part of this constructor; set it via {@link #setSku(long)}.
     */
    public JdItem(String title, Double price, String picture, String url, Date createTime, Date updateTime) {
        this.title = title;
        this.price = price;
        this.picture = picture;
        this.url = url;
        this.createTime = createTime;
        this.updateTime = updateTime;
    }

    // NOTE(review): org.hibernate.annotations.Comment only exists since Hibernate 6.2;
    // Spring Boot 2.6 ships Hibernate 5.6 — confirm the Hibernate version in use.
    @Id
    @GeneratedValue(strategy = GenerationType.SEQUENCE, generator = "idSeqGenerator")
    @SequenceGenerator(name = "idSeqGenerator", sequenceName = "jditem_id_seq",
            initialValue = 1, allocationSize = 1)
    @Comment("主键")
    private long id;

    @Comment("商品编号")
    private long sku;

    @Comment("商品标题")
    private String title;

    @Comment("商品价格")
    private Double price;

    @Comment("商品图片")
    private String picture;

    @Comment("商品详细地址")
    private String url;

    @Comment("创建时间")
    private Date createTime;

    @Comment("更新时间")
    private Date updateTime;

    // Accessors — required by JPA property access and used by JdItemTask.

    public long getId() {
        return id;
    }

    public void setId(long id) {
        this.id = id;
    }

    public long getSku() {
        return sku;
    }

    public void setSku(long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPicture() {
        return picture;
    }

    public void setPicture(String picture) {
        this.picture = picture;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Date createTime) {
        this.createTime = createTime;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }
}
2. 编写dao层
2.1 JdItemDao.java
package com.crawler.daos;
import com.crawler.entities.JdItem;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link JdItem}.
 * Extends {@code JpaRepository<entity, id-type>}; all CRUD and
 * query-by-example methods are generated by Spring Data at runtime,
 * so no method declarations are needed here.
 */
public interface JdItemDao extends JpaRepository<JdItem, Long> {
}
3. 编写service层
3.1 service接口 JdItemService.java
package com.crawler.services;
import com.crawler.entities.JdItem;
import java.util.List;
/**
 * Business operations for crawled JD product records.
 */
public interface JdItemService {

    /**
     * Persists the given item.
     *
     * @param jdItem item to store
     */
    void save(JdItem jdItem);

    /**
     * Finds items matching the non-null properties of the given probe object.
     *
     * @param jdItem probe whose populated fields form the query criteria
     * @return all matching items (possibly empty)
     */
    List<JdItem> findAll(JdItem jdItem);
}
3.2 service实现类 JdItemServiceImpl.java
package com.crawler.services.impl;
import com.crawler.daos.JdItemDao;
import com.crawler.entities.JdItem;
import com.crawler.services.JdItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.data.domain.ExampleMatcher;
import org.springframework.stereotype.Component;
import java.util.List;
@Component
public class JdItemServiceImpl implements JdItemService {

    @Autowired
    private JdItemDao jdItemDao;

    /** Persists the given item via the JPA repository. */
    @Override
    public void save(JdItem jdItem) {
        this.jdItemDao.save(jdItem);
    }

    /**
     * Query-by-example lookup.
     *
     * @param jdItem probe object; its populated fields become the WHERE criteria
     * @return all matching rows (possibly empty)
     */
    @Override
    public List<JdItem> findAll(JdItem jdItem) {
        // BUG FIX: "id" must be excluded from the example explicitly. It is a
        // primitive long, so an unsaved probe always carries id=0; without the
        // ignore path, Example adds "WHERE id = 0" and the query matches nothing,
        // which silently broke the duplicate check in JdItemTask.
        ExampleMatcher matcher = ExampleMatcher.matching().withIgnorePaths("id");
        Example<JdItem> example = Example.of(jdItem, matcher);
        return this.jdItemDao.findAll(example);
    }
}
4. 封装HttpClient工具类
4.1 HttpUtils.java
package com.crawler.utils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
@Component
public class HttpUtils {

    // Pooled connection manager shared by every request made through this bean.
    private PoolingHttpClientConnectionManager cm;

    /**
     * Initializes the connection pool.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Max connections in the whole pool.
        this.cm.setMaxTotal(100);
        // Max connections per target host (route).
        this.cm.setDefaultMaxPerRoute(10);
    }

    /**
     * Downloads the HTML of the given page.
     *
     * @param url page address
     * @return page content decoded as UTF-8, or "" on non-200 status / empty body
     * @throws IOException on network failure
     */
    public String doGetHtml(String url) throws IOException {
        CloseableHttpResponse response = getResponse(url);
        try {
            if (response.getStatusLine().getStatusCode() == 200
                    && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
            return "";
        } finally {
            // BUG FIX: the original returned on the success path without closing,
            // leaking the pooled connection. Always close so it returns to the pool.
            response.close();
        }
    }

    /**
     * Downloads an image and stores it under ./crawler_jd/images/ with a
     * random (UUID-based) file name that keeps the original extension.
     *
     * @param imageUrl image address, e.g. https://img12.360buy....462.jpg
     * @return the generated local file name, or "" on non-200 / empty body
     * @throws IOException on network or file-system failure
     */
    public String doGetImage(String imageUrl) throws IOException {
        CloseableHttpResponse response = getResponse(imageUrl);
        try {
            if (response.getStatusLine().getStatusCode() == 200
                    && response.getEntity() != null) {
                // Keep the original extension (text after the last dot).
                String extName = imageUrl.substring(imageUrl.lastIndexOf("."));
                // Rename to a collision-free UUID-based file name.
                String picName = UUID.randomUUID().toString() + extName;
                File target = new File("./crawler_jd/images/" + picName);
                // BUG FIX: create the target directory on first use —
                // FileOutputStream throws if the directory does not exist.
                File parent = target.getParentFile();
                if (parent != null && !parent.exists()) {
                    parent.mkdirs();
                }
                // BUG FIX: close the stream (try-with-resources); the original leaked it.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
            return "";
        } finally {
            // BUG FIX: always release the pooled connection (see doGetHtml).
            response.close();
        }
    }

    /**
     * Executes a GET request against the given url using the shared pool.
     */
    private CloseableHttpResponse getResponse(String url) throws IOException {
        // 1. Build an httpClient backed by the shared connection pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 2. Create the GET request.
        HttpGet httpGet = new HttpGet(url);
        // 2.1 Apply timeout configuration.
        httpGet.setConfig(this.getConfig());
        // 2.2 Spoof a browser User-Agent; otherwise JD may serve a login page.
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36");
        // 3. Execute; the CALLER is responsible for closing the response.
        return httpClient.execute(httpGet);
    }

    /**
     * Request timeout settings shared by all requests.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                .setSocketTimeout(10 * 1000)      // max time between data packets
                .build();
    }
}
5. 实现数据爬虫
5.1 JdItemTask.java
package com.crawler.tasks;
import com.crawler.entities.JdItem;
import com.crawler.services.JdItemService;
import com.crawler.utils.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Date;
import java.util.List;
@Component
public class JdItemTask {

    private static final Logger logger = LoggerFactory.getLogger(JdItemTask.class);

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private JdItemService jdItemService;

    /**
     * Crawls JD search result pages for the keyword "手机" (mobile phone)
     * and stores any products not yet present in the database.
     * fixedDelay: the next run starts 100s after the previous run finishes.
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void jdItemTask() throws Exception {
        // 1. Base search url; the page number is appended per iteration.
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=7e272b81e0c24f478b9d4721de77b53d&page=";
        // 2. JD's visible result pages use odd "page" values (1, 3, 5, ...).
        for (int pageValue = 1; pageValue < 10; pageValue = pageValue + 2) {
            String content = httpUtils.doGetHtml(url + pageValue);
            // 3. Parse the page and persist its products.
            this.parseHtml(content);
        }
        logger.info("京东搜索手机数据抓取测试完成......");
    }

    /**
     * Parses one search-result page and stores each product.
     * A malformed item is logged and skipped instead of aborting the page.
     *
     * @param content raw HTML of a search result page
     */
    private void parseHtml(String content) throws Exception {
        // 3.1 Parse the HTML into a Document.
        Document doc = Jsoup.parse(content);
        Elements itemElements = doc.select("div#J_goodsList > ul > li");
        for (Element element : itemElements) {
            logger.info("遍历li下面的元素: " + element.text());
            try {
                saveItem(element);
            } catch (Exception e) {
                // ROBUSTNESS: one broken <li> (ad slot, changed markup, bad
                // number format) must not abort the rest of the page.
                logger.warn("跳过无法解析的商品条目: " + e.getMessage(), e);
            }
        }
    }

    /**
     * Extracts one product from its &lt;li&gt; element and persists it
     * unless it was already stored in a previous run.
     */
    private void saveItem(Element element) throws Exception {
        // 3.2 Product sku number; empty for non-product placeholders — skip those
        //     (the original would throw NumberFormatException here).
        String sku = element.select("[data-sku]").attr("data-sku");
        if (sku.isEmpty()) {
            return;
        }
        JdItem jdItem = new JdItem();
        jdItem.setSku(Long.parseLong(sku));
        // 3.2.1 Skip items already stored in the database.
        List<JdItem> list = jdItemService.findAll(jdItem);
        if (list.size() > 0) {
            return;
        }
        // 3.3 Title and price; either may be absent on ad entries — skip then
        //     (the original NPE'd on .first() returning null).
        Element titleElement = element.select(".p-name>a>em").first();
        Element priceElement = element.select(".p-price>strong>i").first();
        if (titleElement == null || priceElement == null) {
            return;
        }
        jdItem.setTitle(titleElement.text());
        jdItem.setPrice(Double.valueOf(priceElement.text()));
        // 3.4 Product image: lazily-loaded images keep the real url in
        //     data-lazy-img; fall back to src when that attribute is empty.
        String picUrl = element.select(".p-img>a>img").attr("data-lazy-img");
        if (picUrl.isEmpty()) {
            picUrl = element.select(".p-img>a>img").attr("src");
        }
        if (!picUrl.isEmpty()) {
            jdItem.setPicture(httpUtils.doGetImage("https:" + picUrl));
        }
        // 3.5 Product detail page url.
        String url = element.selectFirst(".p-name>a").attr("href");
        jdItem.setUrl(url);
        jdItem.setCreateTime(new Date());
        jdItem.setUpdateTime(jdItem.getCreateTime());
        jdItemService.save(jdItem);
    }
}