java 爬虫 (一) 获取商品详情页商品信息以及价格

72 篇文章 0 订阅

依赖 (下面的示例代码还用到了 com.alibaba.fastjson, 需要另外引入 fastjson 依赖):

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.10-FINAL</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
<!--        <dependency>-->
<!--            <groupId>org.jsoup</groupId>-->
<!--            <artifactId>jsoup</artifactId>-->
<!--            <version>1.11.3</version>-->
<!--        </dependency>-->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <scope>provided</scope>
        </dependency>

实体类:  (这个实体类是可以自定义的)



import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;

/**
 * Plain data holder for the information scraped from one product detail page.
 *
 * <p>All accessors, the no-args constructor, the all-args constructor and the
 * builder are generated by the Lombok annotations on this class. The all-args
 * constructor takes its parameters in field declaration order, so reordering
 * the fields changes that constructor's signature.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Product {

    /** Record identifier; the Tmall scraper in this file sets it to 0 when page parsing fails. */
    private Long id;
    /** Item id of the product; the Tmall scraper fills it from the {@code id} query parameter of the page URL. */
    private Long number;
    /** Price kept as text; the Tmall scraper stores either the page's {@code reservePrice} or a failure message here. */
    private String price;
    /** URL of the product detail page that was scraped. */
    private String url;
    /** Product title text. */
    private String title;
    /** Value of the main product image's {@code src} attribute. */
    private String img;
    /** Shop name. */
    private String dianP;
    /** Creation time as text; not populated by the code visible in this file, format unspecified. */
    private String createTime;
    /** Update time as text; not populated by the code visible in this file, format unspecified. */
    private String updateTime;
    /** Titles of the selectable colour options listed on the page. */
    private List<String> color;

    // Getters, setters and constructors come from the Lombok annotations above;
    // they can be replaced with hand-written getters, setters and constructors.
}

demo方法:



import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.chenyou.admin.common.reptilestm.Product;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Demo scraper that downloads a Tmall product detail page and extracts the
 * basic product information (title, colours, image, shop name, price) into a
 * {@link Product}.
 *
 * <p>The CSS selectors and the {@code TShop.Setup(...)} script marker used here
 * are tied to the page markup at the time the demo was written and may need
 * adjusting when the site changes.
 */
public class ProductServiceImp {

    private final static String taobaoUrlSign = "taobao.com";
    private final static String tmallUrlSign = "tmall.com";
    private final static String jingdongUrlSign = "jd.com";

    private final static String TMALL_PRODUCT_DETAIL = "https://detail.tmall.com/item.htm?id=";
    private final static String TAOBAO_PRODUCT_DETAIL = "https://item.taobao.com/item.htm?id=";
    private final static String JD_PRODUCT_DETAIL = "https://item.jd.com/";

    /** Browser user-agent sent with the request so the page is served as it would be to a desktop browser. */
    private final static String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

    /**
     * Fetches a Tmall product detail page and extracts its product information.
     *
     * @param url full detail-page URL; it must carry an {@code id} query
     *            parameter (either as the first parameter, {@code ?id=...}, or
     *            a later one, {@code &id=...})
     * @return the scraped product; a product with id {@code 0} and title
     *         {@code "商品不存在"} when the page was fetched but its fields could
     *         not be parsed; {@code null} when the URL has no {@code id}
     *         parameter, the response status is not 200, or the request or
     *         top-level parsing fails
     */
    public static Product soupTmallDetailByid(String url) {
        String id = extractItemId(url);
        if (id == null) {
            // Without an item id there is nothing to scrape; skip the network call.
            return null;
        }
        // try-with-resources releases the connection and the client on every path.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.setHeader("user-agent", USER_AGENT);
            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != 200) {
                    return null;
                }
                HttpEntity entity = response.getEntity();
                String html = EntityUtils.toString(entity, Consts.UTF_8);
                return parseProduct(Jsoup.parse(html), url, id);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Returns the raw value of the {@code id} query parameter of {@code url},
     * or {@code null} when the URL is null or has no such parameter.
     */
    private static String extractItemId(String url) {
        if (url == null) {
            return null;
        }
        int queryStart = url.indexOf('?');
        if (queryStart < 0) {
            return null;
        }
        String query = url.substring(queryStart + 1);
        int fragmentStart = query.indexOf('#');
        if (fragmentStart >= 0) {
            query = query.substring(0, fragmentStart);
        }
        for (String pair : query.split("&")) {
            // Exact prefix match so parameters such as skuId= or user_id= are not mistaken for id=.
            if (pair.startsWith("id=")) {
                return pair.substring("id=".length());
            }
        }
        return null;
    }

    /**
     * Builds a {@link Product} from the parsed detail page. Throws if the main
     * {@code tb-wrap} container is missing (the caller turns that into
     * {@code null}); any later extraction failure yields the "not found"
     * placeholder product instead.
     */
    private static Product parseProduct(Document doc, String url, String id) {
        Element item = doc.select("div[class='tb-wrap']").get(0);
        Product product = new Product();
        try {
            // Item id
            product.setNumber(Long.valueOf(id));
            String title = item.select("div[class='tb-detail-hd']").select("h1").text();
            product.setTitle(title);
            product.setUrl(url);

            // Colour options
            List<String> color = new ArrayList<>();
            Element propBlock = doc.getElementsByClass("tm-img-prop").get(0);
            Element saleProp = propBlock.getElementsByClass("J_TSaleProp").get(0);
            Elements options = saleProp.select("li");
            for (Element option : options) {
                color.add(option.select("li").attr("title"));
            }
            product.setColor(color);

            // Main product image
            Element detailMeta = doc.select("div[id='J_DetailMeta']").get(0);
            product.setImg(detailMeta.select("img[id='J_ImgBooth']").attr("src"));

            // Shop name
            Element header = doc.select("div[id='headerCon']").get(0);
            product.setDianP(header.select("strong").text());

            // Price
            product.setPrice(extractPrice(doc));
            return product;
        } catch (Exception e) {
            product.setId(0L);
            product.setTitle("商品不存在");
            return product;
        }
    }

    /**
     * Extracts the reserve price from the JSON argument of the inline
     * {@code TShop.Setup(...)} script; the visible price itself is loaded by
     * an Ajax request and is not in the static HTML. Returns the failure text
     * {@code "价格获取失败"} when the script or the JSON cannot be read.
     */
    private static String extractPrice(Document doc) {
        try {
            String afterSetup = doc.toString().split("TShop.Setup\\(")[1];
            // NOTE(review): cuts at the first ')' — a ')' inside the JSON truncates it and the failure text is returned.
            String setupJson = afterSetup.split("\\)")[0];
            JSONObject setup = JSONObject.parseObject(setupJson);
            JSONObject itemDO = setup.getJSONObject("itemDO");
            return itemDO.getString("reservePrice");
        } catch (Exception e) {
            return "价格获取失败";
        }
    }

    /** Demo entry point: scrapes one sample Tmall detail page and prints the result. */
    public static void main(String[] args) {
        System.out.println(soupTmallDetailByid("https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.13.4ba27a9bSAEts0&id=627829903600&skuId=4448961788888&user_id=2997265729&cat_id=2&is_b=1&rn=12f94c881dda696dc3848c212911076a"));
    }
}

 

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值