java 爬虫

前言:
为什么我们要爬取数据

在大数据时代,我们要获取更多数据,就要进行数据的挖掘、分析和筛选。比如做一个项目时需要大量真实的数据,就可以到相关网站上进行爬取。有些网站的数据爬取后保存到数据库还不能直接使用,需要先经过清洗、过滤才能使用。我们知道,有些数据是非常珍贵的。
一: 引入pom文件依赖

<dependency>
				<groupId>org.apache.httpcomponents</groupId>
				<artifactId>httpclient</artifactId>
				<version>4.5.2</version>
</dependency>

二:新建一个测试类

/**
 * Minimal HttpClient example: sends a GET request without parameters and
 * prints the response body to standard output.
 */
public class Demo {

    /**
     * Requests {@code https://www.baidu.com} and prints the body when the
     * server answers with status 200.
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // 1. Describe the request.
        HttpGet httpGet = new HttpGet("https://www.baidu.com");

        // 2. Create the client and execute the request. try-with-resources closes
        //    the response first and the client second, even when execute() or
        //    EntityUtils.toString() throws.
        try (CloseableHttpClient http = HttpClients.createDefault();
             CloseableHttpResponse response = http.execute(httpGet)) {
            // 3. Read the body only for a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String body = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(body);
            }
        }
    }
}

三 查看控制台
在这里插入图片描述
在这里插入图片描述
上面演示的是不带参数的 GET 请求。那么带参数的 GET 请求又该怎么写呢?如下所示:

/**
 * Sends a GET request with query parameters and prints the response body
 * when the server answers with status 200.
 *
 * @param args unused
 * @throws Exception if the URI is invalid, the request fails, or the body cannot be read
 */
public static void main(String[] args) throws Exception {
    // 1. Build the URI. setParameter returns the builder, so further
    //    parameters can be added by chaining more setParameter calls.
    URIBuilder uriBuilder = new URIBuilder("https://www.baidu.com");
    uriBuilder.setParameter("key", "xx");
    HttpGet httpGet = new HttpGet(uriBuilder.build());

    // 2. Execute the request. try-with-resources closes the response and then
    //    the client, even when execute() or EntityUtils.toString() throws.
    try (CloseableHttpClient http = HttpClients.createDefault();
         CloseableHttpResponse response = http.execute(httpGet)) {
        // 3. Read the body only for a successful response.
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(body);
        }
    }
}

不带参数的 POST 请求和不带参数的 GET 请求写法类似,这里就不再重复了,只需要把 HttpGet 换成 HttpPost 即可。
接下来我们看一下带参数的 POST 请求,如下:

/**
 * Sends a POST request carrying URL-encoded form parameters and prints the
 * response body when the server answers with status 200.
 *
 * @param args unused
 * @throws Exception if the form cannot be encoded, the request fails, or the body cannot be read
 */
public static void main(String[] args) throws Exception {
    // 1. Collect the form parameters.
    List<NameValuePair> params = new ArrayList<>();
    params.add(new BasicNameValuePair("key", "value"));

    // 2. Wrap them in a form entity and attach it to the POST request.
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
    HttpPost httpPost = new HttpPost("https://www.baidu.com");
    httpPost.setEntity(formEntity);

    // 3. Execute the request. try-with-resources closes the response and then
    //    the client, even when execute() or EntityUtils.toString() throws.
    try (CloseableHttpClient http = HttpClients.createDefault();
         CloseableHttpResponse response = http.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(body);
        }
    }
}

为了解析数据我们引入了Jsoup 如下依赖

 <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
</dependency>
 
 <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
 </dependency>

新建一个工具类HttpUtils


package com.server.cloud.utils;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * HTTP helper backed by a pooled connection manager. Offers a GET that returns
 * the response body as text and a GET that stores the response body as an
 * image file.
 */
@Component
public class HttpUtils {

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "C:\\Users\\Administrator\\Desktop\\image4";

    /** Connection pool shared by every request issued through this helper. */
    private final PoolingHttpClientConnectionManager cm;

    /** Creates the helper and configures the connection pool limits. */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Upper bound for the whole pool and for each individual route.
        this.cm.setMaxTotal(10000);
        this.cm.setDefaultMaxPerRoute(10000);
    }

    /**
     * Issues a GET request and returns the response body as text.
     *
     * @param url address to request
     * @return the body decoded as UTF-8, or an empty string when the status is
     *         not 200, the response has no entity, or an I/O error occurs
     */
    public String doget(String url) {
        // The client is intentionally not closed: it is built on the shared
        // connection manager, which must stay usable for later requests.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        // try-with-resources closes the response on every path and never touches
        // a null response when execute() itself fails.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads the resource at {@code url} and stores it under a random file
     * name inside the image directory.
     *
     * @param url address of the image
     * @return the generated file name (without directory), or an empty string
     *         when the status is not 200, the response has no entity, or an
     *         I/O error occurs
     */
    public String doGetImage(String url) {
        // See doget: the client shares the pooled connection manager and is not closed.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Random base name plus the extension taken from the URL.
                String picName = UUID.randomUUID().toString() + extensionOf(url);

                File dir = new File(IMAGE_DIR);
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    throw new IOException("Cannot create image directory: " + dir);
                }

                // File(parent, child) inserts the path separator that plain string
                // concatenation would omit; the stream is closed even if writeTo fails.
                try (OutputStream outputStream = new FileOutputStream(new File(dir, picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Extracts the file extension (including the leading dot) from the path
     * part of a URL.
     *
     * @param url address to inspect
     * @return the extension such as {@code ".jpg"}, or an empty string when the
     *         last path segment contains no dot
     */
    private static String extensionOf(String url) {
        // Ignore query string and fragment so they never end up in the file name.
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);

        int dot = path.lastIndexOf('.');
        // A dot before the last slash belongs to the host or a directory, not the file.
        return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
    }

    /**
     * Builds the per-request configuration with the connect, connection-request
     * and socket timeouts used by this helper.
     *
     * @return the request configuration
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(1000)
                .build();
    }

}

新建task类 这里我们是定时跑的 所以叫Task

package com.server.cloud.Task;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.server.cloud.enetity.Item;
import com.server.cloud.service.ItemService;
import com.server.cloud.utils.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.List;


/**
 * Scheduled crawler task: fetches search-result pages, extracts the listed
 * products and stores every product that is not yet known to the item service.
 */
@Component
public class TestTask {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Search-result URL; the page number is appended to it for each request. */
    private static final String SEARCH_URL =
            "https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=shouj&ev=exbrand_%E5%8D%8E%E4%B8%BA%EF%BC%88HUAWEI%EF%BC%89%5E&s=1&click=0&page=";

    /** Prefix of a product detail page; the sku and ".html" are appended. */
    private static final String ITEM_URL_PREFIX = "https://item.jd.com/";

    /**
     * Prefix of the price lookup; the sku is appended.
     * NOTE(review): the original code requested the product HTML page here and
     * then parsed it as JSON, which cannot work. This endpoint is presumed to be
     * the JSON price service returning an array like [{"p": "..."}] — confirm
     * before relying on it.
     */
    private static final String PRICE_URL_PREFIX = "https://p.3.cn/prices/mgets?skuIds=J_";

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;

    /**
     * Crawls result pages 1 to 9 and stores the products found on them.
     * Triggered by the cron expression on this method.
     *
     * @throws Exception if a response cannot be parsed
     */
    @Scheduled(cron = "0 */2 * * * ?")
    public void itemTask() throws Exception {
        // The original loop used "i=i++", which never advances the counter, and
        // requested the same URL every time; advance the page and put it in the URL.
        for (int page = 1; page < 10; page++) {
            String html = httpUtils.doget(SEARCH_URL + page);
            this.parse(html);
        }
    }

    /**
     * Parses one search-result page and stores every product variant (sku)
     * that the item service does not already contain.
     *
     * @param html markup of the search-result page
     * @throws Exception if the price response cannot be parsed
     */
    private void parse(String html) throws Exception {
        Document document = Jsoup.parse(html);
        Elements spuElements = document.select("div#J_goodsList > ul > li");

        for (Element spuElement : spuElements) {
            String spu = spuElement.attr("data-spu");

            for (Element skuElement : spuElement.select("li.ps-item")) {
                String sku = skuElement.select("[data-sku]").attr("data-sku");
                if (sku.isEmpty()) {
                    // Without a sku neither the detail page nor the price can be looked up.
                    continue;
                }

                // Skip variants that were stored by an earlier run.
                List<Item> existing = itemService.selectAll(sku);
                if (existing.size() > 0) {
                    continue;
                }

                Item item = new Item();
                item.setSku(sku);
                item.setSpu(spu);

                String itemUrl = ITEM_URL_PREFIX + sku + ".html";
                item.setUrl(itemUrl);
                item.setPic(this.downloadPicture(skuElement));
                item.setPrice(this.fetchPrice(sku));

                String detailHtml = this.httpUtils.doget(itemUrl);
                item.setTitle(Jsoup.parse(detailHtml).select("div.sku-name").text());

                this.itemService.insert(item);
            }
        }
    }

    /**
     * Downloads the picture referenced by a sku element.
     *
     * @param skuElement element of one product variant
     * @return the stored file name, or an empty string when the element has no
     *         picture or the download fails
     */
    private String downloadPicture(Element skuElement) {
        Element image = skuElement.select("img[data-sku]").first();
        if (image == null) {
            return "";
        }
        String raw = image.attr("data-lazy-img");
        if (raw.isEmpty()) {
            return "";
        }
        // String.replace returns a new string; the original discarded the result.
        String src = toAbsoluteUrl(raw).replace("/n9/", "/n1/");
        return this.httpUtils.doGetImage(src);
    }

    /**
     * Turns an image reference into an absolute https URL.
     *
     * @param raw attribute value; may be absolute, protocol-relative ("//host/...")
     *            or a bare host and path
     * @return the absolute URL
     */
    private static String toAbsoluteUrl(String raw) {
        if (raw.startsWith("http://") || raw.startsWith("https://")) {
            return raw;
        }
        if (raw.startsWith("//")) {
            return "https:" + raw;
        }
        return "https://" + raw;
    }

    /**
     * Looks up the price of a sku.
     *
     * @param sku product variant identifier
     * @return the value of field "p" of the first array element, or an empty
     *         string when the response is empty or has no such field
     * @throws Exception if the response is not valid JSON
     */
    private String fetchPrice(String sku) throws Exception {
        String json = this.httpUtils.doget(PRICE_URL_PREFIX + sku);
        if (json == null || json.trim().isEmpty()) {
            return "";
        }
        // path() never returns null, and asText() yields the plain value without
        // the surrounding JSON quotes that String.valueOf(node) would keep.
        return MAPPER.readTree(json).path(0).path("p").asText();
    }

}

如有不足之处请指出来 一起共同学习

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值