页面爬取

package com.shengdun.demo.controller;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Small command-line demo that downloads one Amazon product page with Apache
 * HttpClient and picks pieces out of it with Jsoup.
 *
 * <p>It prints the raw page, the colour and size variation entries, and the
 * price. Several other lookups (image attributes, title, page container,
 * description) are performed but their results are not consumed yet; they are
 * kept as a record of the selectors for the unfinished image-list work.
 */
public class test {
    /**
     * Fetches the product page, parses it and prints the extracted values to
     * standard output.
     *
     * @param args ignored
     * @throws Exception if the HTTP request fails, the response carries no
     *                   body, or the body cannot be read
     */
    public static void main(String[] args) throws Exception{

        String webContent;
        // try-with-resources closes the response and then the client even when
        // reading the body throws, so no connection is leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(
                     new HttpGet("https://www.amazon.cn/dp/B075LGPY95/ref=lp_665002051_1_2?s=wireless&ie=UTF8&qid=1547102558&sr=1-2"))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IllegalStateException("Response has no entity: " + response.getStatusLine());
            }
            // "utf-8" is the fallback charset used when the response does not declare one.
            webContent = EntityUtils.toString(entity, "utf-8");
        }
        System.out.println("网页内容:"+webContent);

        Document document = Jsoup.parse(webContent);

        // Image metadata attributes of the two possible main-image elements.
        // NOTE(review): not consumed yet -- see the TODO on the image list below.
        String elements = document.getElementsByAttributeValue("id","imgBlkFront").attr("data-a-dynamic-image");
        String landingImage = document.getElementsByAttributeValue("id","landingImage").attr("data-a-dynamic-image");
        // Strips every ":[ddd,ddd]" fragment (two three-digit numbers) from the attribute value;
        // presumably these are image dimensions following each URL -- confirm against a real page.
        String e = elements.replaceAll("\\:\\[[0-9]{3}\\,[0-9]{3}\\]","");

        // Image list. TODO: still under test -- nothing is added to it yet, so it prints as [].
        List<String> list = new ArrayList<>();
        Elements imgs = document.getElementsByTag("img");

        // Colours: every element whose id matches color_name_<digit>...; print its <img> children,
        // then the src and alt of the first such <img>.
        Elements colorNames = document.getElementsByAttributeValueMatching("id","color_name_[0-9].*?");
        for (Element colorName : colorNames) {
            Elements colorImages = colorName.getElementsByTag("img");
            System.out.println(colorImages);
            System.out.println(colorImages.attr("src"));
            System.out.println(colorImages.attr("alt"));
        }

        // Sizes: every element whose id matches size_name_<digit>...; print its title and text.
        Elements sizeNames = document.getElementsByAttributeValueMatching("id","size_name_[0-9].*?");
        for (Element sizeName : sizeNames) {
            System.out.println(sizeName.attr("title"));
            System.out.println(sizeName.text());
        }

        // Product title (not consumed yet).
        String productTitle = document.getElementsByAttributeValue("id","productTitle").text();

        // Price: the id varies (priceblock_...price), so match by pattern and keep
        // the text of the last matching <span>; stays empty when none matches.
        Elements priceCandidates = document.getElementsByAttributeValueMatching("id","priceblock_.*?price");
        String price = "";
        for (Element candidate : priceCandidates) {
            if ("span".equals(candidate.tagName())) {
                price = candidate.text();
            }
        }
        System.out.println(price);

        // Whole product-page container (not consumed yet).
        Elements pageContent = document.getElementsByAttributeValue("id","dp-container");
        // Product description block (not consumed yet).
        Elements productDescription = document.getElementsByAttributeValue("id","productDescription");

        System.out.println(list);
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值