Java 爬虫:爬取天猫、淘宝、京东的搜索页和商品详情
先识别商品 URL 所属的平台,并从 URL 中提取商品编号,再根据平台和商品编号爬取对应的数据。
1.导包
<!-- 爬虫相关Jar包依赖 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<scope>provided</scope>
</dependency>
2.封装返回类型和常量
引入 Lombok,使用 @Data 注解来避免手写 getter、setter、toString 等重复代码。
package java1024.xyz.vo;
import lombok.Data;
/**
 * Result holder returned by the URL analysis step: which platform a product
 * URL belongs to and the product number extracted from it.
 *
 * Getters, setters, equals/hashCode and toString are generated by Lombok's
 * {@code @Data}.
 *
 * @author xivin
 * @email 1250402127@qq.com
 * @description Outcome of analysing a product URL (status, platform, product number).
 * @date 2020/1/3
 */
@Data
public class UrlData {
// Analysis outcome. UrlUtils.analyseUrl sets this to 0 when the URL is empty
// or has no query part; the meaning of other values is not visible in this
// part of the file. NOTE(review): confirm the full set of status codes.
private int status;
// Platform marker; UrlUtils.analyseUrl stores a UrlConst domain sign here
// (e.g. UrlConst.tmallUrlSign).
private String platform;
// Product number taken from the URL; presumably the platform's item id
// (e.g. the "id" query parameter of a Tmall link). TODO confirm.
private Long number;
}
package java1024.xyz.vo;
/**
 * Shared string constants used when recognising a product URL's platform and
 * when building product detail page URLs.
 *
 * The constants are grouped per platform. As interface fields they are all
 * implicitly {@code public static final}.
 */
public interface UrlConst {

    // ---- Tmall ----

    /** Domain fragment that marks a URL as belonging to Tmall. */
    String tmallUrlSign = "tmall.com";

    /** Tmall product detail URL prefix, ending in the {@code id=} query parameter. */
    String TMALL_PRODUCT_DETAIL = "https://detail.tmall.com/item.htm?id=";

    // ---- Taobao ----

    /** Domain fragment that marks a URL as belonging to Taobao. */
    String taobaoUrlSign = "taobao.com";

    /** Taobao product detail URL prefix, ending in the {@code id=} query parameter. */
    String TAOBAO_PRODUCT_DETAIL = "https://item.taobao.com/item.htm?id=";

    // ---- JD (Jingdong) ----

    /** Domain fragment that marks a URL as belonging to JD. */
    String jingdongUrlSign = "jd.com";

    /** JD product detail URL prefix (scheme, host and trailing slash). */
    String JD_PRODUCT_DETAIL = "https://item.jd.com/";
}
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.Data;
import java.io.Serializable;
import java.sql.Timestamp;
/**
 * Product entity.
 *
 * Accessors, equals/hashCode and toString are generated by Lombok's
 * {@code @Data}; the class is also marked {@link Serializable}.
 * NOTE(review): no serialVersionUID is declared — confirm whether Java
 * serialization of this class is actually relied on.
 *
 * @author xivin
 * @email 1250402127@qq.com
 * @description Product entity class
 * @date 2020/1/3
 */
@Data
public class Product implements Serializable {
// Identifier of this record; presumably the database primary key — TODO confirm.
private Long id;
// Product number; presumably the platform-side item id extracted from the
// product URL — TODO confirm.
private Long number;
// Product price. NOTE(review): Float is a binary floating-point type and
// cannot represent most decimal amounts exactly; consider BigDecimal if
// this value is used for money arithmetic.
private Float price;
// Id of the associated user; the exact relationship is not visible here.
private Integer userId;
// Product URL.
private String url;
// Numeric id of the platform; the id-to-platform mapping is not visible here.
private Integer platformId;
// Product title.
private String title;
// Product description text.
private String describe;
// Record status; the meaning of individual values is not visible here.
private Integer status;
// Creation time; Jackson reads/writes it using the pattern below.
// NOTE(review): no timezone is specified on the annotation — confirm the
// serialized time zone is the intended one.
@JsonFormat( pattern="yyyy-MM-dd HH:mm:ss")
private Timestamp createdAt;
// Last-update time. NOTE(review): unlike createdAt this field carries no
// @JsonFormat, so the two timestamps may serialize in different formats —
// confirm whether that is intended.
private Timestamp updatedAt;
}
3.前期工作完成后,开始封装识别 URL 的工具类 UrlUtils.java
/**
* @author xivin
* @email 1250402127@qq.com
* @description
* @date 2020/1/3
*/
public class UrlUtils {
public static UrlData analyseUrl(String url) {
UrlData urlData = new UrlData();
try {
// 判空
if (StringUtils.isEmpty(url)) {
urlData.setStatus(0);
return urlData;
}
//天猫
if (url.contains(UrlConst.tmallUrlSign)) {
urlData.setPlatform(UrlConst.tmallUrlSign);
String numberStr = "";
/**
* 切分根路径 和 参数 如:
* https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.8.27832a99AfoD5W&id=604433373792
* 在 ?问号的地方切成两部分
*
*/
String[] roudAndParams = url.split("\\?");
if (roudAndParams.length < 2) {
urlData.setStatus(0);
return urlData;
}
/**
* 获取 参数字符串,