Java使用Matcher进行内容循环比对(爬取代理IP)

本文详细介绍了如何使用Java通过Jsoup库爬取多个代理IP网站,匹配IP正则表达式,验证IP可用性,并将有效代理IP存储到数据库的过程。代码包括了HTTP请求头配置、IP爬取与验证、数据库操作等关键步骤。
摘要由CSDN通过智能技术生成

项目说明:

此类工具的本质是文本匹配:使用正则表达式对页面内容进行循环匹配,从中提取所需的数据。本文主要用于代理IP的爬取,并对爬取到的代理IP进行可用性校验,最后将可用的代理IP存入数据库。本教程仅作学习交流使用,请勿将此代码用于非正常用途。

代码实现:

1.引入所需的maven依赖

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>

2.配置文件:ftp.properties

在resources中添加此文件

#66代理网址
bbip = http://www.66ip.cn/page.html
#快代理网址
kuaidaili = https://free.kuaidaili.com/free/inha/page/
#yq代理
yqie = http://ip.yqie.com/proxygaoni/index_page.htm

3.工具类:发送请求并匹配返回内容

方法属性:

    // Regular expression for one listing entry: four dot-separated groups of 1-3 digits,
    // a single space, then a port of 1-6 digits. The digit groups are not range-checked.
    private final static String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";
    // Count of crawled proxy IPs. Declared static, so the value is shared by all instances.
    private static int IPNumber = 0;

参数说明:String url :需要爬取代理IP的网址

                int wantNumber:需要爬取的数量(数量越多,爬取的页面就会越多。也就会更慢)

    /**
     * Crawls a paginated proxy listing and returns the proxies that pass {@code checkProxy}.
     *
     * <p>Pages 1 to 99 are requested in order; the page URL is built by replacing every
     * occurrence of the literal text {@code page} in {@code url} with the page number. Crawling
     * stops as soon as {@code wantNumber} distinct IPs have been collected, or when the pages
     * run out. A page that fails to download or parse is logged and skipped.
     *
     * <p>Progress is measured by the size of the returned map, not by a class-wide counter, so
     * every call starts from zero and an IP that shows up twice is counted once.
     *
     * @param url        listing URL template containing the placeholder text {@code page}
     * @param wantNumber maximum number of usable proxies to collect
     * @return map of IP to port for the usable proxies found; empty when {@code url} is
     *         {@code null}, {@code wantNumber} is not positive, or nothing usable was found
     */
    public HashMap<String, String> findIP (String url, int wantNumber){
        HashMap<String, String> ipMap = new HashMap<>();
        if (url == null || wantNumber <= 0) {
            log.warn("--------------------参数无效,跳过爬取: url=" + url + ", wantNumber=" + wantNumber);
            return ipMap;
        }
        // The expression is identical for every page, so compile it once.
        Pattern ipPattern = Pattern.compile(IP_REG);
        for (int page = 1; page < 100 && ipMap.size() < wantNumber; page++) {
            String finalURL = url.replace("page", String.valueOf(page));  // page-specific address
            log.info("--------------------当前页数:" + page);
            log.info("--------------------当前网址:" + finalURL);
            try {
                // No explicit Host header: the HTTP client derives it from finalURL, whereas a
                // fixed value would be wrong for every site except the one it names.
                Document doc = Jsoup.connect(finalURL)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip, deflate, sdch")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                        .header("Cache-Control", "max-age=0")
                        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                        .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                        .header("Referer", "http://www.kuaidaili.com/free")
                        .timeout(30 * 1000)  // 30 second timeout
                        .get();
                Matcher matcher = ipPattern.matcher(doc.text());
                // find() continues after the previous match, so no manual offset is needed.
                while (ipMap.size() < wantNumber && matcher.find()) {
                    String[] parts = matcher.group().split(" ");
                    String ip = parts[0];
                    String port = parts[1];
                    if (port.equals(ipMap.get(ip))) {
                        continue;  // same ip:port already accepted; skip the slow re-check
                    }
                    // Keep only proxies through which a page can actually be fetched.
                    if (checkProxy(ip, Integer.valueOf(port))) {
                        log.info("--------------------获取到可用代理IP:" + ip + ":" + port);
                        log.info("--------------------ip地址:" + ip);
                        log.info("--------------------端口号:" + port);
                        ipMap.put(ip, port);
                        log.info("--------------------可用代理IP:" + ip + ":" + port + "已入MAP集合,等待入库");
                    }
                }
            } catch (Exception e) {
                // Best effort: one bad page must not abort the crawl, but keep the stack trace.
                log.warn("--------------------页面爬取失败:" + finalURL, e);
            }
        }
        return ipMap;
    }

3.工具类:验证获取的代理IP是否可用

    /**
     * Tests a proxy by fetching a fixed probe page through it.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} when the probe request completes without throwing,
     *         {@code false} when any exception is raised (the failure is logged)
     */
    public static boolean checkProxy(String ip, Integer port) {
        boolean usable;
        try {
            // Probe target; any page that responds quickly can be used instead.
            Jsoup.connect("http://1212.ip138.com/ic.asp").timeout(2 * 1000).proxy(ip, port).get();
            usable = true;
        } catch (Exception e) {
            String failure = "--------------------获取到代理IP:" + ip + ":" + port + "---访问失败";
            log.info(failure);
            usable = false;
        }
        return usable;
    }

4.Controller层:

package cn.axin229913.SendMessage.Controller;

import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Service.ToolService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;

/**
 * REST controller exposing the proxy-IP tooling under the {@code /Tool} path.
 */
@RestController
@RequestMapping("/Tool")
@CrossOrigin    // allow cross-origin calls from the front end
public class ToolController {

    @Autowired
    private ToolService toolService;

    /**
     * Handles {@code /Tool/findIP} by delegating to {@link ToolService#findIP(String, int)}.
     *
     * @param web        request parameter forwarded unchanged to the service
     * @param wantNumber request parameter forwarded unchanged to the service
     * @return the {@link Result} produced by the service
     * @throws Exception whatever the service throws
     */
    @RequestMapping("findIP")
    public Result findIP(String web, int wantNumber) throws Exception {
        final Result outcome = toolService.findIP(web, wantNumber);
        return outcome;
    }

}

5.Service层:

package cn.axin229913.SendMessage.Service;

import cn.axin229913.SendMessage.Pojo.Result;

/**
 * Service contract for the proxy-IP crawling tool.
 */
public interface ToolService {
    
    /**
     * Crawls proxy IPs from one of the configured listing sites and reports the outcome.
     *
     * @param web        site selector; in the accompanying implementation {@code "2"} and
     *                   {@code "3"} pick the kuaidaili and yqie entries of ftp.properties and
     *                   any other value picks the bbip entry
     * @param wantNumber number of proxies requested
     * @return a {@link Result} carrying a status code and message
     * @throws Exception declared broadly; the accompanying implementation performs file and
     *                   database access
     */
    Result findIP (String web, int wantNumber) throws Exception;

}

        Impl层:

package cn.axin229913.SendMessage.Service.Impl;

import cn.axin229913.SendMessage.Mapper.ToolMapper;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import cn.axin229913.SendMessage.Service.ToolService;
import cn.axin229913.SendMessage.Tools.ProxyTool;
import cn.axin229913.SendMessage.Tools.SendMessage;
import cn.axin229913.SendMessage.Tools.uploadTool;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;

/**
 * Default {@link ToolService}: picks a proxy listing site from {@code ftp.properties}, crawls it
 * with {@link ProxyTool}, and stores every proxy that is not yet in the database.
 */
@Slf4j
@Service
public class ToolServiceImpl implements ToolService {

    @Autowired
    private ToolMapper toolMapper;

    /**
     * Crawls usable proxies from the selected site and persists the new ones.
     *
     * @param web        site selector: {@code "2"} = kuaidaili, {@code "3"} = yqie; {@code "1"},
     *                   {@code null} and any other value = bbip
     * @param wantNumber number of proxies requested, passed through to {@link ProxyTool}
     * @return a result with status 200 whose message reports either the number of proxies
     *         crawled or the list of proxies that could not be inserted
     * @throws Exception if {@code ftp.properties} is missing or unreadable, the selected site
     *                   has no configured URL, or the crawl or database access fails
     */
    @Override
    public Result findIP (String web, int wantNumber) throws Exception {
        Properties properties = loadSiteProperties();
        String website;
        // A missing request parameter arrives as null; treat it like the default site
        // instead of letting the string switch throw a NullPointerException.
        switch (web == null ? "1" : web){
            case "2" :
                website = properties.getProperty("kuaidaili");break;
            case "3" :
                website = properties.getProperty("yqie");break;
            case "1" :
            default :
                website = properties.getProperty("bbip");break;
        }
        if (website == null) {
            throw new IllegalStateException("No proxy site URL configured in ftp.properties for web=" + web);
        }

        ProxyTool proxyTool = new ProxyTool();
        HashMap<String, String> ipMap = proxyTool.findIP(website, wantNumber);

        // Collect failed inserts so a later success cannot overwrite the error report.
        StringBuilder failed = new StringBuilder();
        for (String ip : ipMap.keySet()){
            String port = ipMap.get(ip);
            List<ipPojo> existing = toolMapper.findIP(ip, port);
            if (!existing.isEmpty()){
                continue;  // already stored
            }
            int rows = toolMapper.addIP2DB(ip, port, "1");
            if (rows != 0){
                log.info("--------------------可用代理IP:" + ip + ":" + port + "已入数据");
            }else {
                log.info("--------------------可用代理IP:" + ip + ":" + port + "入库错误!!!");
                if (failed.length() > 0) {
                    failed.append(",");
                }
                failed.append(ip).append(":").append(port);
            }
        }

        // Always populate the result, including when nothing was crawled, and report the
        // real number of proxies instead of a value derived from the request.
        Result result = new Result();
        if (failed.length() == 0) {
            result.setStatus(200).setMsg(website+"已爬取"+ipMap.size()+"条数据,且已入数据库");
        } else {
            // NOTE(review): status stays 200 as in the original error branch; confirm whether
            // callers expect a different code for partial failures.
            result.setStatus(200).setMsg("可用代理IP:" + failed + "入库错误!!!");
        }
        return result;
    }

    /** Loads {@code /ftp.properties} from the classpath and closes the stream afterwards. */
    private Properties loadSiteProperties() throws IOException {
        Properties properties = new Properties();
        try (InputStream in = uploadTool.class.getResourceAsStream("/ftp.properties")) {
            if (in == null) {
                throw new FileNotFoundException("ftp.properties not found on the classpath");
            }
            // Explicit charset so decoding does not depend on the platform default.
            properties.load(new BufferedReader(
                    new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8)));
        }
        return properties;
    }

}

6.Mapper层:

package cn.axin229913.SendMessage.Mapper;

import cn.axin229913.SendMessage.Pojo.ipPojo;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.springframework.stereotype.Repository;

import java.util.List;

/**
 * MyBatis mapper for the {@code Ip_Proxy} table; the statements are defined in the mapper XML
 * under the same namespace.
 */
@Mapper
@Repository
public interface ToolMapper {
    
    /**
     * Inserts one proxy row.
     *
     * @param ip     proxy IP, bound to {@code #{ip}}
     * @param port   proxy port, bound to {@code #{port}}
     * @param status status value, bound to {@code #{status}}
     * @return the int result of the insert; the service treats 0 as a failed insert
     */
    int addIP2DB(@Param("ip")String ip,
                @Param("port")String port,
                 @Param("status")String status);
    
    /**
     * Looks up rows with status 1 that match the given IP and port.
     *
     * @param ip   proxy IP to match
     * @param port proxy port to match
     * @return matching rows; the statement selects only the {@code id} column, so the other
     *         properties of the returned objects are not populated by this query
     */
    List<ipPojo> findIP(@Param("ip")String ip,
                        @Param("port")String port);
    
}

7.Mapper.xml文件:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.axin229913.SendMessage.Mapper.ToolMapper">

    <!-- Column-to-property mapping shared by the queries in this file -->
    <resultMap id="IpMap" type="cn.axin229913.SendMessage.Pojo.ipPojo">
        <id column="id" property="id" />
        <result column="ip" property="ip" />
        <result column="port" property="port" />
        <result column="status" property="status" />
    </resultMap>

    <!-- Inserts one proxy; id is passed as null so the auto-increment column assigns it -->
    <insert id="addIP2DB" parameterType="String">
        insert into Ip_Proxy(id,ip,port,status) values (null,#{ip},#{port},#{status})
    </insert>

    <!-- Existence check for an active proxy. status is a varchar column, so it is compared
         with the string literal '1' (the value addIP2DB writes) rather than the number 1,
         which would make the database convert the column value for every row. -->
    <select id="findIP" parameterType="String" resultMap="IpMap">
        select id from Ip_Proxy where ip = #{ip} and port = #{port} and status = '1'
    </select>

</mapper>

8.Pojo类:

package cn.axin229913.SendMessage.Pojo;

import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.experimental.Accessors;

/**
 * Row of the {@code Ip_Proxy} table: one proxy endpoint and its status.
 *
 * <p>Accessors are generated by Lombok; {@code chain = true} makes the setters return
 * {@code this} so calls can be chained.
 */
@Data
@Accessors(chain = true)
@TableName("Ip_Proxy")  // maps this class to the Ip_Proxy table
public class ipPojo {
    @TableId(type = IdType.AUTO)  // primary key, auto-increment
    private Integer id;
    // Proxy IP address
    private String ip;
    // Proxy port, kept as text
    private String port;
    // Proxy status; the service stores "1" for proxies it has verified
    private String status;
}

9.数据库字段执行sql:

数据库字段说明:

id:主键自增

ip:IP号

port:端口号

status:ip的状态,默认是1,表示正常使用

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for Ip_Proxy
-- ----------------------------
-- WARNING: drops any existing Ip_Proxy table, including its rows, before recreating it.
DROP TABLE IF EXISTS `Ip_Proxy`;
-- One row per stored proxy.
--   id     : auto-increment primary key
--   ip     : proxy IP address
--   port   : proxy port, stored as text
--   status : proxy status, stored as text; no DEFAULT is declared, the service inserts '1'
-- The text columns declare utf8mb4 explicitly while the table default charset is latin1.
CREATE TABLE `Ip_Proxy`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `port` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `status` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 21 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

10.启动效果图:

 

 

 

 方法到此处已完结!!!

如有错误,望大神指正!!

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值