项目说明:
此类工具的本质是文本匹配:使用正则表达式对页面内容进行循环匹配,从而得到需要的数据。本文主要介绍代理IP的爬取、代理IP的校验,以及将可用代理IP存入数据库的操作;本教程仅作学习交流使用,请勿将此代码用于非正常用途。
代码实现:
1.引入需要的maven依赖
<!-- jsoup: HTML fetching/parsing library; the crawler code in this article calls Jsoup.connect(...) from it -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.配置文件:ftp.properties
在resources中添加此文件
# 66ip proxy list URL. The literal text "page" is a placeholder: the crawler replaces it with the page number.
bbip = http://www.66ip.cn/page.html
# kuaidaili proxy list URL ("page" is replaced with the page number)
kuaidaili = https://free.kuaidaili.com/free/inha/page/
# yqie proxy list URL ("page" is replaced with the page number)
yqie = http://ip.yqie.com/proxygaoni/index_page.htm
3.工具类:发送请求并匹配返回内容
方法属性:
/**
 * Regular expression for one proxy entry in a page's text: four dot-separated groups of
 * 1-3 digits (the IP), a single space, then 1-6 digits (the port).
 */
private static final String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";

/** Number of usable proxy IPs recorded by the crawler; starts at zero. */
private static int IPNumber;
参数说明:String url :需要爬取代理IP的网址
int wantNumber:需要爬取的数量(数量越多,爬取的页面就会越多。也就会更慢)
/** Exclusive upper bound on the page index tried for one site, so a crawl always terminates. */
private static final int MAX_PAGE_INDEX = 100;

/** {@code IP_REG} compiled once instead of once per crawled page. */
private static final Pattern IP_PATTERN = Pattern.compile(IP_REG);

/**
 * Crawls a proxy-list site page by page and collects the proxies that pass {@code checkProxy}.
 *
 * <p>Pages are requested in order (1, 2, ...) until {@code wantNumber} distinct IPs have been
 * collected or the page limit is reached. A page that fails to download or parse is logged
 * and skipped; it does not abort the crawl.
 *
 * <p>Progress is tracked per call through the size of the returned map. Earlier versions
 * counted in the shared static {@code IPNumber}, which was never reset, so a second call
 * returned nothing once the counter had passed {@code wantNumber}.
 *
 * @param url        page URL template; every occurrence of the literal text {@code page} is
 *                   replaced with the page number
 * @param wantNumber maximum number of usable proxies to collect; values {@code <= 0} yield an
 *                   empty map
 * @return map of proxy IP to port (as text) holding at most {@code wantNumber} entries; never
 *         {@code null}
 */
public HashMap<String, String> findIP (String url, int wantNumber){
    HashMap<String, String> ipMap = new HashMap<>();
    if (url == null || wantNumber <= 0) {
        return ipMap;
    }
    for (int i = 1; i < MAX_PAGE_INDEX && ipMap.size() < wantNumber; i++) {
        // Build the address of the i-th listing page.
        String finalURL = url.replace("page", String.valueOf(i));
        log.info("--------------------当前页数:" + i);
        log.info("--------------------当前网址:" + finalURL);
        try {
            // No explicit Host header: the HTTP client derives it from the URL, and the
            // previously hard-coded "www.kuaidaili.com" was wrong for the other sites.
            Document doc = Jsoup.connect(finalURL)
                    .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                    .header("Accept-Encoding", "gzip, deflate, sdch")
                    .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                    .header("Cache-Control", "max-age=0")
                    .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                    .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                    .header("Referer", "http://www.kuaidaili.com/free")
                    .timeout(30 * 1000) // 30 second timeout
                    .get();
            Matcher matcher = IP_PATTERN.matcher(doc.text());
            // Stop in the middle of a page as soon as enough proxies have been collected.
            while (ipMap.size() < wantNumber && matcher.find()) {
                String[] strs = matcher.group().split(" ");
                String ip = strs[0];
                String port = strs[1];
                // Keep only proxies that can actually serve a request.
                if (checkProxy(ip, Integer.valueOf(port))) {
                    log.info("--------------------获取到可用代理IP:" + ip + ":" + port);
                    log.info("--------------------ip地址:" + ip);
                    log.info("--------------------端口号:" + port);
                    ipMap.put(ip, port);
                    log.info("--------------------可用代理IP:" + ip + ":" + port + "已入MAP集合,等待入库");
                }
            }
        } catch (Exception e) {
            // Best effort: a broken page must not stop the crawl, but the failure is recorded.
            log.error("--------------------页面爬取失败:" + finalURL, e);
        }
    }
    // Mirror the per-call count into the legacy static counter in case other code reads it.
    IPNumber = ipMap.size();
    return ipMap;
}
3.工具类:验证获取的代理IP是否可用
/**
 * Checks whether a proxy is usable by requesting a test page through it.
 *
 * @param ip   proxy host address
 * @param port proxy port
 * @return {@code true} when the request completes without throwing; {@code false} when any
 *         exception occurs (the failure is logged at info level)
 */
public static boolean checkProxy(String ip, Integer port) {
    boolean usable;
    try {
        // http://1212.ip138.com/ic.asp can be replaced with any page that responds quickly
        Jsoup.connect("http://1212.ip138.com/ic.asp")
                .timeout(2000)
                .proxy(ip, port)
                .get();
        usable = true;
    } catch (Exception probeFailure) {
        log.info("--------------------获取到代理IP:" + ip + ":" + port + "---访问失败");
        usable = false;
    }
    return usable;
}
4.Controller层:
package cn.axin229913.SendMessage.Controller;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Service.ToolService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
/**
 * REST controller exposing the tool endpoints under {@code /Tool}.
 */
@CrossOrigin // allow cross-origin calls between front end and back end
@RequestMapping("/Tool")
@RestController
public class ToolController {

    @Autowired
    private ToolService toolService;

    /**
     * Delegates a proxy-crawl request to {@link ToolService#findIP(String, int)}.
     *
     * @param web        site selector forwarded unchanged to the service
     * @param wantNumber requested number of proxies, forwarded unchanged to the service
     * @return the {@link Result} produced by the service
     * @throws Exception whatever the service call throws
     */
    @RequestMapping("findIP")
    public Result findIP(String web, int wantNumber) throws Exception {
        Result crawlResult = toolService.findIP(web, wantNumber);
        return crawlResult;
    }
}
5.Service层:
package cn.axin229913.SendMessage.Service;
import cn.axin229913.SendMessage.Pojo.Result;
/**
 * Service contract for the proxy-IP crawling tool.
 */
public interface ToolService {
/**
 * Crawls proxy IPs from one site and reports the outcome.
 *
 * @param web        site selector; in ToolServiceImpl "2" selects the kuaidaili URL, "3" the
 *                   yqie URL and any other value the bbip URL
 * @param wantNumber number of proxies requested
 * @return a Result describing the crawl
 * @throws Exception declared broadly; the concrete failures depend on the implementation
 */
Result findIP (String web, int wantNumber) throws Exception;
}
Impl层:
package cn.axin229913.SendMessage.Service.Impl;
import cn.axin229913.SendMessage.Mapper.ToolMapper;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import cn.axin229913.SendMessage.Service.ToolService;
import cn.axin229913.SendMessage.Tools.ProxyTool;
import cn.axin229913.SendMessage.Tools.SendMessage;
import cn.axin229913.SendMessage.Tools.uploadTool;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@Slf4j
@Service
public class ToolServiceImpl implements ToolService {
@Autowired
private ToolMapper toolMapper;
@Override
public Result findIP (String web, int wantNumber) throws Exception {
Result result = new Result();
Properties properties = new Properties();
InputStream in = uploadTool.class.getResourceAsStream("/ftp.properties");
BufferedReader bf = new BufferedReader(new InputStreamReader(in));
properties.load(bf);
String website;
switch (web){
case "2" :
website = properties.getProperty("kuaidaili");break;
case "3" :
website = properties.getProperty("yqie");break;
case "1" :
default :
website = properties.getProperty("bbip");break;
}
ProxyTool proxyTool = new ProxyTool();
HashMap<String, String> ipMap = proxyTool.findIP(website, wantNumber);
for (String ip : ipMap.keySet()){
String ip1 = ip;
String port1 = ipMap.get(ip);
List<ipPojo> ip2 = toolMapper.findIP(ip1, port1);
if (ip2.isEmpty()){
int rows = toolMapper.addIP2DB(ip1, port1, "1");
if (rows != 0){
log.info("--------------------可用代理IP:" + ip1 + ":" + port1 + "已入数据");
result.setStatus(200).setMsg(website+"已爬取"+(wantNumber+1)+"条数据,且已入数据库");
}else {
result.setStatus(200).setMsg("可用代理IP:" + ip1 + ":" + port1 + "入库错误!!!");
log.info("--------------------可用代理IP:" + ip1 + ":" + port1 + "入库错误!!!");
}
}
result.setStatus(200).setMsg(website+"已爬取"+(wantNumber+1)+"条数据,且已入数据库");
}
return result;
}
}
6.Mapper层:
package cn.axin229913.SendMessage.Mapper;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.springframework.stereotype.Repository;
import java.util.List;
/**
 * MyBatis mapper for the Ip_Proxy table; the SQL lives in the mapper XML with the same namespace.
 */
@Mapper
@Repository
public interface ToolMapper {
/**
 * Inserts one proxy row with the given ip, port and status.
 *
 * @param ip     proxy IP address
 * @param port   proxy port, as text
 * @param status status value to store
 * @return the int returned by the insert statement; the service treats non-zero as success
 */
int addIP2DB(@Param("ip")String ip,
@Param("port")String port,
@Param("status")String status);
/**
 * Looks up rows with the given ip and port whose status is 1.
 * The query selects only the id column, so only id is populated in the results.
 *
 * @param ip   proxy IP address
 * @param port proxy port, as text
 * @return matching rows
 */
List<ipPojo> findIP(@Param("ip")String ip,
@Param("port")String port);
}
7.Mapper.xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.axin229913.SendMessage.Mapper.ToolMapper">
    <!-- Column-to-property mapping for ipPojo query results -->
    <resultMap id="IpMap" type="cn.axin229913.SendMessage.Pojo.ipPojo">
        <id column="id" property="id" />
        <result column="ip" property="ip" />
        <result column="port" property="port" />
        <result column="status" property="status" />
    </resultMap>
    <!-- Inserts one proxy row; id is passed as null so the AUTO_INCREMENT column assigns it.
         parameterType is omitted: the mapper method passes several @Param arguments, not one String. -->
    <insert id="addIP2DB">
        insert into Ip_Proxy(id,ip,port,status) values (null,#{ip},#{port},#{status})
    </insert>
    <!-- Existence check for an active proxy. status is a varchar column, so it is compared
         with the string '1' to avoid an implicit numeric conversion of the column. -->
    <select id="findIP" resultMap="IpMap">
        select id from Ip_Proxy where ip = #{ip} and port = #{port} and status = '1'
    </select>
</mapper>
8.Pojo类:
package cn.axin229913.SendMessage.Pojo;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.experimental.Accessors;
/**
 * Row of the Ip_Proxy table: one proxy with its port and status.
 */
@Data
@Accessors(chain = true)
@TableName("Ip_Proxy") //the class must map one-to-one onto the table
public class ipPojo {
@TableId(type = IdType.AUTO) //primary key is auto-incremented
private Integer id;
// proxy IP address
private String ip;
// proxy port, kept as text
private String port;
// proxy status; the service inserts new rows with "1"
private String status;
}
9.数据库字段执行sql:
数据库字段说明:
id:主键自增
ip:IP号
port:端口号
status:ip的状态,默认是1,表示正常使用
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for Ip_Proxy
-- WARNING: the DROP below deletes any existing Ip_Proxy table and its data.
-- ----------------------------
DROP TABLE IF EXISTS `Ip_Proxy`;
-- The table default charset now matches the utf8mb4 columns (it was latin1), the dump
-- artifact AUTO_INCREMENT = 21 is removed so ids start at 1, and status gets the
-- default of '1' that the field description promises.
CREATE TABLE `Ip_Proxy` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `port` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `status` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '1',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
10.启动效果图:
方法到此处已完结!!!
如有错误,望大神指正!!