记录一次使用 Java 爬取「快代理」免费 IP 代理网页的过程
一: 使用 HttpClient + Jsoup 获取并解析网页源代码(多线程)
创建 IPEntity类(实体类)
package edu.liuzheng;
/**
* @author qingyun
* @version 1.0
* @date 2021/5/23 20:41
*/
/**
 * Plain data holder for one proxy entry scraped from the kuaidaili free-proxy table.
 * All values are kept as the raw text of the corresponding table cell.
 *
 * Fields are private (the original exposed them as public mutable fields) and
 * renamed to conventional lowerCamelCase; every getter/setter signature and the
 * {@link #toString()} output are unchanged, so existing callers are unaffected.
 */
public class IPEntity {
    /** Proxy IP address. */
    private String ip;
    /** Proxy port. */
    private String port;
    /** Anonymity level (e.g. transparent / anonymous / elite). */
    private String anonymity;
    /** Protocol type (e.g. HTTP / HTTPS). */
    private String type;
    /** Geographic location of the proxy. */
    private String position;
    /** Response time as reported by the site. */
    private String responseSpeed;
    /** Timestamp of the site's last verification of this proxy. */
    private String lastVerificationTime;

    public String getIP() {
        return ip;
    }

    public void setIP(String IP) {
        this.ip = IP;
    }

    public String getPORT() {
        return port;
    }

    public void setPORT(String PORT) {
        this.port = PORT;
    }

    public String getAnonymity() {
        return anonymity;
    }

    public void setAnonymity(String anonymity) {
        this.anonymity = anonymity;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPosition() {
        return position;
    }

    public void setPosition(String position) {
        this.position = position;
    }

    public String getResponseSpeed() {
        return responseSpeed;
    }

    public void setResponseSpeed(String responseSpeed) {
        this.responseSpeed = responseSpeed;
    }

    public String getLastVerificationTime() {
        return lastVerificationTime;
    }

    public void setLastVerificationTime(String lastVerificationTime) {
        this.lastVerificationTime = lastVerificationTime;
    }

    @Override
    public String toString() {
        // Key labels intentionally preserved byte-for-byte from the original output format.
        return "IPEntity{" +
                "IP='" + ip + '\'' +
                ", PORT='" + port + '\'' +
                ", Anonymity='" + anonymity + '\'' +
                ", type='" + type + '\'' +
                ", position='" + position + '\'' +
                ", ResponseSpeed='" + responseSpeed + '\'' +
                ", LastVerificationTime='" + lastVerificationTime + '\'' +
                '}';
    }
}
创建 CrawlingTest (主要方法类)
package edu.liuzheng;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 使用 HttpClient + Jsoup 获取并解析网页源代码
* 继承线程类
* @author qingyun
* @version 1.0
* @date 2021/5/23 21:36
*/
public class CrawlingTest implements Runnable {
    /**
     * Shared result list. Wrapped in a synchronized view because the class is
     * explicitly designed to be run from multiple threads, and the original bare
     * ArrayList is not safe for concurrent add().
     */
    static List<IPEntity> ipEntities = Collections.synchronizedList(new ArrayList<>());

    /**
     * Next page number to crawl. AtomicInteger so each concurrent run can claim
     * a distinct page with a single getAndIncrement().
     */
    public static AtomicInteger index = new AtomicInteger(1);

    public CrawlingTest() {
    }

    /**
     * Fetches one page of the kuaidaili free-proxy list with HttpClient, parses
     * the result table with Jsoup, and appends one {@link IPEntity} per row to
     * {@link #ipEntities}. Each call consumes exactly one page number from
     * {@link #index}, whether or not the fetch succeeds (same as the original,
     * which incremented in {@code finally}).
     */
    public static void getIPList() {
        // Claim this run's page number exactly once. The original read `index`
        // both when building the URL and when logging completion, and only
        // incremented in `finally` — under concurrent runs two threads could
        // fetch the same page and log a wrong page number.
        int page = index.getAndIncrement();

        // 1. Create the HTTP client — conceptually, open a browser.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;

        // 2. Build the GET request — conceptually, type the URL in the address bar.
        HttpGet request = new HttpGet("https://www.kuaidaili.com/free/inha/" + page + "/");
        // Impersonate a browser; without a User-Agent the site may refuse to answer.
        request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");

        try {
            // 3. Execute the request — conceptually, press Enter.
            response = httpClient.execute(request);
            System.out.println(response.getStatusLine().getStatusCode());

            // 4. Only parse successful (200) responses.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the response body as UTF-8 HTML.
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "utf-8");

                // 6. Parse the HTML with Jsoup and locate the first table body.
                Document document = Jsoup.parse(html);
                Element postList = document.selectFirst("tbody");
                if (postList == null) {
                    // Guard: a 200 response without the expected table (layout
                    // change or anti-bot page) would otherwise NPE below.
                    System.out.println("未找到tbody,页面结构可能已变化");
                    return; // finally still closes the resources
                }

                // One <tr> per proxy entry; columns are fixed by the site layout.
                for (Element row : postList.select("tr")) {
                    Elements titleEle = row.select("td");
                    if (titleEle.size() < 7) {
                        // Guard: skip header/malformed rows instead of IndexOutOfBounds.
                        continue;
                    }
                    IPEntity ipEntity = new IPEntity();
                    System.out.println("IP:" + titleEle.get(0).text());
                    ipEntity.setIP(titleEle.get(0).text());
                    System.out.println("PORT:" + titleEle.get(1).text());
                    ipEntity.setPORT(titleEle.get(1).text());
                    System.out.println("匿名度:" + titleEle.get(2).text());
                    ipEntity.setAnonymity(titleEle.get(2).text());
                    System.out.println("类型:" + titleEle.get(3).text());
                    ipEntity.setType(titleEle.get(3).text());
                    System.out.println("位置:" + titleEle.get(4).text());
                    ipEntity.setPosition(titleEle.get(4).text());
                    System.out.println("响应速度:" + titleEle.get(5).text());
                    ipEntity.setResponseSpeed(titleEle.get(5).text());
                    System.out.println("最后验证时间:" + titleEle.get(6).text());
                    ipEntity.setLastVerificationTime(titleEle.get(6).text());
                    ipEntities.add(ipEntity);
                }
                System.out.println("第" + page + "页完成");
            } else {
                // Non-200 (404, 503, ...): just dump the body; nothing to parse.
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a single
            // catch replaces the original's two identical handlers.
            e.printStackTrace();
        } finally {
            // 7. Always release the response and client.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
    }

    @Override
    public void run() {
        // Entry point used by the scheduled executor: crawl one page per run.
        getIPList();
    }
}
二: 使用 ScheduledExecutorService 线程池执行定时多线程任务
创建 HttpClientTest 类
package edu.liuzheng;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Entry point: drives {@link CrawlingTest} on a fixed-rate schedule so that one
 * proxy-list page is crawled every few seconds.
 */
public class HttpClientTest {
    public static void main(String[] args) {
        // First run fires after 1 second; subsequent runs every 3 seconds.
        final long initialDelaySeconds = 1;
        final long periodSeconds = 3;

        // Scheduled pool with 10 core threads to host the recurring crawl task.
        ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(10);
        scheduler.scheduleAtFixedRate(
                new CrawlingTest(), initialDelaySeconds, periodSeconds, TimeUnit.SECONDS);
    }
}
结果如下