记录一次网页爬取 快代理 免费IP代理的方式 使用JAVA实现

1 篇文章 0 订阅
1 篇文章 0 订阅

记录一次网页爬取 快代理 免费IP代理的方式 使用JAVA实现

一: 使用 HttpClient + Jsoup 获取并解析网页源代码(多线程)

创建 IPEntity 类(实体类)

package edu.liuzheng;

/**
 * Plain data holder for one proxy entry; every attribute is kept as a raw string.
 *
 * @author qingyun
 * @version 1.0
 * @date 2021/5/23 20:41
 */
public class IPEntity {

    /** IP address. */
    public String IP;

    /** Port. */
    public String PORT;

    /** Anonymity level. */
    public String Anonymity;

    /** Type. */
    public String type;

    /** Location. */
    public String position;

    /** Response time. */
    public String ResponseSpeed;

    /** Time of the last verification. */
    public String LastVerificationTime;

    /** @return the IP address */
    public String getIP() {
        return this.IP;
    }

    /** @param IP the IP address to store */
    public void setIP(String IP) {
        this.IP = IP;
    }

    /** @return the port */
    public String getPORT() {
        return this.PORT;
    }

    /** @param PORT the port to store */
    public void setPORT(String PORT) {
        this.PORT = PORT;
    }

    /** @return the anonymity level */
    public String getAnonymity() {
        return this.Anonymity;
    }

    /** @param anonymity the anonymity level to store */
    public void setAnonymity(String anonymity) {
        this.Anonymity = anonymity;
    }

    /** @return the type */
    public String getType() {
        return this.type;
    }

    /** @param type the type to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the location */
    public String getPosition() {
        return this.position;
    }

    /** @param position the location to store */
    public void setPosition(String position) {
        this.position = position;
    }

    /** @return the response time */
    public String getResponseSpeed() {
        return this.ResponseSpeed;
    }

    /** @param responseSpeed the response time to store */
    public void setResponseSpeed(String responseSpeed) {
        this.ResponseSpeed = responseSpeed;
    }

    /** @return the time of the last verification */
    public String getLastVerificationTime() {
        return this.LastVerificationTime;
    }

    /** @param lastVerificationTime the time of the last verification to store */
    public void setLastVerificationTime(String lastVerificationTime) {
        this.LastVerificationTime = lastVerificationTime;
    }

    /**
     * Renders all fields as {@code IPEntity{name='value', ...}}; unset fields print as
     * {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("IPEntity{");
        text.append("IP='").append(this.IP).append('\'');
        text.append(", PORT='").append(this.PORT).append('\'');
        text.append(", Anonymity='").append(this.Anonymity).append('\'');
        text.append(", type='").append(this.type).append('\'');
        text.append(", position='").append(this.position).append('\'');
        text.append(", ResponseSpeed='").append(this.ResponseSpeed).append('\'');
        text.append(", LastVerificationTime='").append(this.LastVerificationTime).append('\'');
        text.append('}');
        return text.toString();
    }

}

创建 CrawlingTest 类(主要方法类)

package edu.liuzheng;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Fetches a listing page with HttpClient and parses its first table with Jsoup.
 *
 * <p>Implements {@link Runnable} (it does not extend {@code Thread}) so it can be handed to an
 * executor. Each {@link #run()} fetches the next page number from {@link #index} and appends the
 * parsed rows to {@link #ipEntities}.
 *
 * @author qingyun
 * @version 1.0
 * @date 2021/5/23 21:36
 */
public class CrawlingTest implements Runnable{

    /** Number of {@code <td>} cells read from every table row (indices 0 to 6). */
    private static final int EXPECTED_CELLS = 7;

    /**
     * Entries collected so far. Wrapped in a synchronized list because this state is static and
     * may be appended to from several executor threads; callers that iterate it must still
     * synchronize on the list themselves.
     */
    static List<IPEntity> ipEntities = Collections.synchronizedList(new ArrayList<IPEntity>());

    /**
     * Next page number to fetch. Atomic so that concurrent callers each claim a distinct page.
     */
    public static AtomicInteger index = new AtomicInteger(1);

    public CrawlingTest() {

    }

    /**
     * Fetches one listing page and stores every complete table row in {@link #ipEntities}.
     *
     * <p>The page number is claimed from {@link #index} exactly once, up front, so the request
     * URL and the completion message always refer to the same page and {@code index} has advanced
     * by one when this method returns, whether or not the fetch succeeded. I/O failures are
     * printed and otherwise ignored.
     */
    public static  void  getIPList(){
        // Claim the page atomically; reading index again later could observe another thread's
        // increment.
        final int page = index.getAndIncrement();

        HttpGet request = new HttpGet("https://www.kuaidaili.com/free/inha/"+page+"/");
        // Send a browser User-Agent; without it the site may not return the page.
        request.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");

        // try-with-resources closes the response first and then the client, even on failure.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {
            int status = response.getStatusLine().getStatusCode();
            System.out.println(status);

            // getEntity() may be null; EntityUtils.toString rejects a null entity.
            HttpEntity httpEntity = response.getEntity();
            String body = httpEntity == null ? "" : EntityUtils.toString(httpEntity, "utf-8");

            if (status == HttpStatus.SC_OK) {
                if (parsePage(body)) {
                    System.out.println("第"+page+"页完成");
                } else {
                    System.out.println("第"+page+"页未找到表格数据,已跳过");
                }
            } else {
                // Non-200 answers (404 and so on) are only reported.
                System.out.println("返回状态不是200");
                System.out.println(body);
            }
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException subclass.
            e.printStackTrace();
        }
    }

    /**
     * Parses the first {@code <tbody>} of the page and stores each complete row.
     *
     * @param html raw HTML of the listing page
     * @return {@code false} when the page contains no {@code <tbody>}, {@code true} otherwise
     */
    private static boolean parsePage(String html) {
        Document document = Jsoup.parse(html);
        Element tableBody = document.selectFirst("tbody");
        if (tableBody == null) {
            // selectFirst returns null when nothing matches, e.g. on an error or challenge page.
            return false;
        }
        for (Element row : tableBody.select("tr")) {
            Elements cells = row.select("td");
            if (cells.size() < EXPECTED_CELLS) {
                // Rows without all seven cells cannot be mapped; skip them instead of throwing.
                continue;
            }
            ipEntities.add(toEntity(cells));
        }
        return true;
    }

    /**
     * Maps the first seven cells of a row to an {@link IPEntity}, echoing each value to stdout.
     *
     * @param cells the {@code <td>} elements of one row; must contain at least seven entries
     * @return the populated entity
     */
    private static IPEntity toEntity(Elements cells) {
        IPEntity ipEntity = new IPEntity();

        String ip = cells.get(0).text();
        System.out.println("IP:" + ip);
        ipEntity.setIP(ip);

        String port = cells.get(1).text();
        System.out.println("PORT:" + port);
        ipEntity.setPORT(port);

        String anonymity = cells.get(2).text();
        System.out.println("匿名度:" + anonymity);
        ipEntity.setAnonymity(anonymity);

        String type = cells.get(3).text();
        System.out.println("类型:" + type);
        ipEntity.setType(type);

        String position = cells.get(4).text();
        System.out.println("位置:" + position);
        ipEntity.setPosition(position);

        String responseSpeed = cells.get(5).text();
        System.out.println("响应速度:" + responseSpeed);
        ipEntity.setResponseSpeed(responseSpeed);

        String lastVerified = cells.get(6).text();
        System.out.println("最后验证时间:" + lastVerified);
        ipEntity.setLastVerificationTime(lastVerified);

        return ipEntity;
    }

    /**
     * Executor entry point: fetches and parses one page.
     *
     * <p>Unchecked exceptions are caught here because a periodic task that throws is never
     * scheduled again by {@code ScheduledExecutorService}; one bad page must not stop the crawl.
     */
    @Override
    public void run() {
        try {
            getIPList();
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
    }
}

二: 使用 ScheduledExecutorService 线程池定时执行爬取任务

创建 HttpClientTest 类

package edu.liuzheng;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Entry point that runs {@link CrawlingTest} on a fixed schedule.
 */
public class HttpClientTest {

    /** Core pool size of the scheduled executor. */
    private static final int CORE_POOL_SIZE = 10;

    /** Delay before the first run, in seconds. */
    private static final long INITIAL_DELAY_SECONDS = 1L;

    /** Interval between the starts of consecutive runs, in seconds. */
    private static final long PERIOD_SECONDS = 3L;

    /**
     * Creates the scheduled pool and submits one crawl task at a fixed rate. The executor is
     * never shut down here, so the task keeps being scheduled.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(CORE_POOL_SIZE);
        final Runnable crawlTask = new CrawlingTest();
        scheduler.scheduleAtFixedRate(
                crawlTask, INITIAL_DELAY_SECONDS, PERIOD_SECONDS, TimeUnit.SECONDS);
    }
}

结果如下

在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值