依赖:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.jsoup</groupId>-->
<!-- <artifactId>jsoup</artifactId>-->
<!-- <version>1.11.3</version>-->
<!-- </dependency>-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<scope>provided</scope>
</dependency>
实体类: (这个实体类是可以自定义的)
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
/**
 * Plain data holder describing one scraped product.
 *
 * <p>Getters, setters, the builder and both constructors are generated by the
 * Lombok annotations below. The all-args constructor takes its parameters in
 * field declaration order, so the fields must not be reordered.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Product {
/** Record id; the scraper in this file sets it to 0 when page parsing fails. */
private Long id;
/** Item id parsed from the {@code id} query parameter of the product URL. */
private Long number;
/** Price as text; the scraper stores a failure message here when it cannot be read. */
private String price;
/** URL the product was scraped from. */
private String url;
/** Product title. */
private String title;
/** Value of the main product image's {@code src} attribute. */
private String img;
/** Shop name text taken from the page header. */
private String dianP;
/** Creation time as text; not populated by the scraper in this file (format unspecified here). */
private String createTime;
/** Last update time as text; not populated by the scraper in this file (format unspecified here). */
private String updateTime;
/** Titles of the selectable colour options. */
private List<String> color;
// Getters/setters and constructors come from the Lombok annotations above;
// replace them with hand-written ones if Lombok is not available.
}
demo方法:
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.chenyou.admin.common.reptilestm.Product;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo scraper that downloads a Tmall product detail page and extracts the
 * basic product information (title, colours, image, shop name, price) from
 * its HTML.
 */
public class ProductServiceImp {
    private final static String taobaoUrlSign = "taobao.com";
    private final static String tmallUrlSign = "tmall.com";
    private final static String jingdongUrlSign = "jd.com";
    private final static String TMALL_PRODUCT_DETAIL = "https://detail.tmall.com/item.htm?id=";
    private final static String TAOBAO_PRODUCT_DETAIL = "https://item.taobao.com/item.htm?id=";
    private final static String JD_PRODUCT_DETAIL = "https://item.jd.com/";

    /** Browser-like user agent sent with the request (copied from a real browser request). */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

    /** Marker preceding the inline JSON argument that carries the price data. */
    private static final String SETUP_MARKER = "TShop.Setup(";

    /** HTTP status code for a successful response. */
    private static final int HTTP_OK = 200;

    /**
     * Fetches a Tmall product detail page and parses it into a {@link Product}.
     *
     * @param url product detail URL; must contain an {@code id} query parameter
     * @return the parsed product; a product with id {@code 0} and title
     *         "商品不存在" when the detail fields cannot be read from the page;
     *         or {@code null} when the URL has no id, the request fails, the
     *         response status is not 200, or the page has no product wrapper
     */
    public static Product soupTmallDetailByid(String url) {
        String id = extractItemId(url);
        if (id == null) {
            System.err.println("No id parameter found in URL: " + url);
            return null;
        }
        try {
            HttpGet httpGet = new HttpGet(url);
            // Pretend to be a browser so the site serves the regular page.
            httpGet.setHeader("user-agent", USER_AGENT);
            // try-with-resources releases the connection and the client on every path.
            try (CloseableHttpClient httpclient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpclient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != HTTP_OK) {
                    return null;
                }
                String html = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
                return parseProduct(html, id, url);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Returns the value of the {@code id} query parameter, or {@code null} when
     * the URL is null or has no such parameter. The parameter may appear in any
     * position of the query string (after {@code ?} or {@code &}).
     */
    private static String extractItemId(String url) {
        if (url == null) {
            return null;
        }
        // Splitting on the delimiters means "skuId=" or "user_id=" never match by accident.
        for (String part : url.split("[?&#]")) {
            if (part.startsWith("id=")) {
                return part.substring("id=".length());
            }
        }
        return null;
    }

    /**
     * Builds a {@link Product} from the page HTML. Throws if the page has no
     * {@code tb-wrap} container; any later extraction failure yields the
     * "商品不存在" placeholder product instead.
     */
    private static Product parseProduct(String html, String id, String url) {
        Document doc = Jsoup.parse(html);
        // The selectors below follow the page's current markup and may need
        // adjusting if the site layout changes.
        Element item = doc.select("div[class='tb-wrap']").get(0);
        Product product = new Product();
        try {
            product.setNumber(Long.valueOf(id));
            product.setTitle(item.select("div[class='tb-detail-hd']").select("h1").text());
            product.setUrl(url);
            product.setColor(extractColors(doc));
            // Main product image.
            Element detailMeta = doc.select("div[id='J_DetailMeta']").get(0);
            product.setImg(detailMeta.select("img[id='J_ImgBooth']").attr("src"));
            // Shop name.
            Element shopHeader = doc.select("div[id='headerCon']").get(0);
            product.setDianP(shopHeader.select("strong").text());
            product.setPrice(extractReservePrice(html));
            return product;
        } catch (Exception e) {
            // Best effort: a page missing the expected elements is reported as a missing product.
            product.setId(0L);
            product.setTitle("商品不存在");
            return product;
        }
    }

    /** Collects the {@code title} attribute of every colour option in the first sale-property list. */
    private static List<String> extractColors(Document doc) {
        List<String> colors = new ArrayList<>();
        Element propGroup = doc.getElementsByClass("tm-img-prop").get(0);
        Element saleProp = propGroup.getElementsByClass("J_TSaleProp").get(0);
        for (Element option : saleProp.select("li")) {
            colors.add(option.attr("title"));
        }
        return colors;
    }

    /**
     * Reads {@code itemDO.reservePrice} from the inline {@code TShop.Setup(...)}
     * JSON. The price is loaded asynchronously on the live page, so the inline
     * script data is the only place it appears in the static HTML.
     *
     * @return the price text, or "价格获取失败" when it cannot be extracted
     */
    private static String extractReservePrice(String html) {
        try {
            String json = extractSetupJson(html);
            if (json == null) {
                return "价格获取失败";
            }
            JSONObject setup = JSONObject.parseObject(json);
            return setup.getJSONObject("itemDO").getString("reservePrice");
        } catch (Exception e) {
            return "价格获取失败";
        }
    }

    /**
     * Returns the JSON object literal passed to {@code TShop.Setup(}, or
     * {@code null} when it is absent or unterminated. Braces are balanced while
     * skipping string literals, so a ')' or '}' inside a string value does not
     * end the object early.
     */
    private static String extractSetupJson(String html) {
        int marker = html.indexOf(SETUP_MARKER);
        if (marker < 0) {
            return null;
        }
        int start = html.indexOf('{', marker + SETUP_MARKER.length());
        if (start < 0) {
            return null;
        }
        int depth = 0;
        boolean inString = false;
        boolean escaped = false;
        char quote = 0;
        for (int i = start; i < html.length(); i++) {
            char c = html.charAt(i);
            if (inString) {
                if (escaped) {
                    escaped = false;
                } else if (c == '\\') {
                    escaped = true;
                } else if (c == quote) {
                    inString = false;
                }
            } else if (c == '"' || c == '\'') {
                inString = true;
                quote = c;
            } else if (c == '{') {
                depth++;
            } else if (c == '}') {
                depth--;
                if (depth == 0) {
                    return html.substring(start, i + 1);
                }
            }
        }
        return null;
    }

    /** Demo entry point: scrapes one sample product page and prints the result. */
    public static void main(String[] args) {
        String url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.13.4ba27a9bSAEts0&id=627829903600&skuId=4448961788888&user_id=2997265729&cat_id=2&is_b=1&rn=12f94c881dda696dc3848c212911076a";
        System.out.println(soupTmallDetailByid(url));
    }
}