说一下我在使用 WebMagic 时遇到的问题:我当时连接的是公司内网,结果爬虫无法下载并解析页面。原因是在公司内网环境下,WebMagic 必须通过代理才能访问目标页面,而我之前没有考虑到这一点。下面附上解决代码:
package ai.zerox.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
/**
 * {@link ProxyProvider} that hands out one fixed proxy for every download.
 *
 * <p>The host and port below are placeholders: replace them with the address
 * and port of the proxy on your corporate intranet before compiling.
 *
 * @author AiShuangPeng
 * @date 2023/7/19 10:38
 */
public class CustomProxyProvider implements ProxyProvider {

    /** Proxy host placeholder; fill in the intranet proxy address. */
    private static final String PROXY_HOST = "填写公司内网ip";

    /** Proxy port placeholder; fill in the proxy port number. */
    private static final int PROXY_PORT = 填写端口号;

    /**
     * Does nothing: there is no pool to hand the proxy back to.
     *
     * @param proxy the proxy that was used
     * @param page  the page downloaded through it
     * @param task  the task the download belonged to
     */
    @Override
    public void returnProxy(Proxy proxy, Page page, Task task) {
        // Intentionally empty.
    }

    /**
     * Supplies the proxy to use for a download.
     *
     * <p>A lookup in a proxy-IP pool could be plugged in here; this version
     * always builds a proxy from the fixed host and port.
     *
     * @param task the task requesting a proxy (not consulted)
     * @return a new proxy built from the configured host and port
     */
    @Override
    public Proxy getProxy(Task task) {
        return new Proxy(PROXY_HOST, PROXY_PORT);
    }
}
package ai.zerox.proxy;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
/**
 * {@link HttpClientDownloader} that installs a {@link CustomProxyProvider}
 * as its proxy provider on construction.
 */
public class CustomDownloader extends HttpClientDownloader {

    /** Creates the downloader and registers the proxy provider on it. */
    public CustomDownloader() {
        super();
        final CustomProxyProvider proxyProvider = new CustomProxyProvider();
        setProxyProvider(proxyProvider);
    }
}
package ai.zerox.crawlers;
import ai.zerox.proxy.CustomDownloader;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
/**
 * Assembles the crawler from the injected components and launches it.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:01
 */
@Component
public class InfoSpider {

    /** Start URL of the crawl; replace it with the address you want to crawl. */
    private static final String START_URL = "http://*****************/index.html";

    @Autowired
    private PageProcessor processor;

    @Autowired
    private Pipeline pipeline;

    @Autowired
    private Scheduler scheduler;

    /**
     * Builds a new spider from the injected processor, pipeline and scheduler,
     * seeds it with the start URL, and starts it.
     *
     * @author AiShuangPeng
     * @date 2023/7/19 8:34
     */
    public void doCrawler() {
        Spider spider = Spider.create(processor);
        // Use the custom pipeline.
        spider.addPipeline(pipeline);
        // Use the injected scheduler (InfoScheduler configures one with a Bloom-filter duplicate remover).
        spider.setScheduler(scheduler);
        // Seed the crawl with the start URL.
        spider.addUrl(START_URL);
        // Download through the proxy (needed when running on the corporate intranet).
        spider.setDownloader(new CustomDownloader());
        // Start the crawler.
        spider.start();
    }
}
package ai.zerox.crawlers;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
/**
 * Spring configuration that provides the crawler's {@link Scheduler}.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/5/22 18:07
 */
@Configuration
public class InfoScheduler {

    /** Sizing argument passed to the {@link BloomFilterDuplicateRemover}. */
    private static final int BLOOM_FILTER_SIZE = 10_000_000;

    /**
     * Creates a queue scheduler whose duplicate removal uses a Bloom filter.
     *
     * <p>The method name is kept as-is because Spring uses it as the default
     * bean name.
     *
     * @return the configured scheduler
     * @author AiShuangPeng
     * @date 2023/7/19 21:01
     */
    @Bean
    public Scheduler crateScheduler() {
        final BloomFilterDuplicateRemover deduplicator =
                new BloomFilterDuplicateRemover(BLOOM_FILTER_SIZE);
        final QueueScheduler scheduler = new QueueScheduler();
        scheduler.setDuplicateRemover(deduplicator);
        return scheduler;
    }
}
package ai.zerox.crawlers;
import ai.zerox.pojo.Info;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * Page-analysis logic of the crawler.
 *
 * <p>A page containing {@code li > span} elements is treated as a list page:
 * its detail links and next-page link are queued and nothing is passed to the
 * pipeline. Any other page is treated as a detail page and parsed into an
 * {@link Info} object.
 *
 * @author AiShuangPeng
 * @date 2023/5/19 21:09
 */
@Component
public class InfoProcessor implements PageProcessor {

    /**
     * Number of leading characters removed from each {@code h6 span} text.
     * NOTE(review): presumably the length of a label such as "信息来源:" -
     * confirm against the target site.
     */
    private static final int LABEL_LENGTH = 5;

    /**
     * Number of trailing paragraphs left out of the article body.
     * NOTE(review): presumably site boilerplate - confirm against the target site.
     */
    private static final int TRAILING_PARAGRAPHS_SKIPPED = 2;

    /** Shared site settings: charset, HTTP retry count and sleep time. */
    private static final Site SITE = Site.me()
            // Character set used for the site.
            .setCharset("UTF-8")
            // Number of HTTP connection retries.
            .setRetryTimes(30)
            // Thread sleep time.
            .setSleepTime(2000);

    /**
     * Dispatches a downloaded page to list-page or detail-page handling.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        List<Selectable> listMarkers = html.css("li > span", "text").nodes();

        // No list markers: this is a detail page.
        if (listMarkers.isEmpty()) {
            this.parseInfo(page);
            return;
        }

        // List page: queue every detail link found in the list.
        List<String> detailLinks = html.css(".conter_listCont.AllListCon li").links().all();
        page.addTargetRequests(detailLinks);

        // Queue the next list page, if there is one. get() is used instead of
        // String.valueOf(selectable) so the lookup does not depend on toString().
        String nextPage = html.css(".page div .next").links().get();
        if (nextPage != null) {
            page.addTargetRequest(nextPage);
        }

        // List pages carry no data for the pipeline.
        page.getResultItems().setSkip(true);
    }

    /**
     * Parses a detail page into an {@link Info} object and stores it in the
     * page's result under the key {@code "info"}.
     *
     * <p>Every value that is missing or empty on the page is stored as
     * {@code null} rather than raising an exception.
     *
     * @param page the downloaded detail page
     */
    public void parseInfo(Page page) {
        Html html = page.getHtml();

        // Title; get() may yield null when the page has no <h1>.
        String title = emptyToNull(html.css("h1", "text").get());

        // First span: information source; second span: release time.
        // Each is read only if it is actually present.
        List<String> h6Spans = html.css("h6 span", "text").all();
        String information = h6Spans.size() > 0 ? stripLabel(h6Spans.get(0)) : null;
        String releaseTime = h6Spans.size() > 1 ? stripLabel(h6Spans.get(1)) : null;

        // Article body: all paragraphs except the trailing ones, concatenated.
        List<String> paragraphs = html.css(".news_cont_d_wrap p", "text").all();
        StringBuilder body = new StringBuilder();
        for (int i = 0; i < paragraphs.size() - TRAILING_PARAGRAPHS_SKIPPED; i++) {
            body.append(paragraphs.get(i));
        }
        String content = emptyToNull(body.toString());

        // Attachment links are recorded as the list's string form ("[a, b]");
        // they are not queued for download. A string shorter than 3 characters
        // can only be "[]" (or a single empty link) and is treated as absent.
        List<String> attachmentLinks = html.css("div .fjdown").links().all();
        String annexFile = attachmentLinks.toString();
        if (annexFile.length() < 3) {
            annexFile = null;
        }

        // Wrap everything in the entity.
        Info info = new Info();
        info.setContent(content);
        info.setTitle(title);
        info.setAnnexFile(annexFile);
        info.setReleaseDate(releaseTime);
        info.setInformation(information);

        // Hand the entity to the pipeline.
        page.putField("info", info);
    }

    /**
     * Returns the site settings used by this processor.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return SITE;
    }

    /** Maps {@code null} and the empty string to {@code null}; returns anything else unchanged. */
    private static String emptyToNull(String value) {
        return (value == null || value.isEmpty()) ? null : value;
    }

    /**
     * Removes the leading {@link #LABEL_LENGTH} characters from a span text;
     * returns {@code null} when the text is missing or shorter than the label.
     */
    private static String stripLabel(String text) {
        if (text == null || text.length() < LABEL_LENGTH) {
            return null;
        }
        return text.substring(LABEL_LENGTH);
    }
}
package ai.zerox.crawlers;
import ai.zerox.mapper.InfoMapper;
import ai.zerox.pojo.Info;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Persistence step of the crawler: saves the parsed {@link Info} to the database.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:46
 */
@Component
public class InfoPipeline implements Pipeline {

    @Autowired
    private InfoMapper infoMapper;

    /**
     * Inserts the {@link Info} stored under the key {@code "info"} when at
     * least one of its fields carries text.
     *
     * <p>The processor stores missing values as {@code null}, so every field
     * is checked null-safely; a result without an {@code "info"} entry is
     * ignored.
     *
     * @param resultItems the values produced by the page processor
     * @param task        the task the result belongs to (not consulted)
     */
    @Override
    @Transactional
    public void process(ResultItems resultItems, Task task) {
        // Fetch the entity handed over by the PageProcessor.
        Info info = resultItems.get("info");
        if (info == null) {
            return;
        }

        // Save only when there is actually some data.
        boolean hasData = hasText(info.getTitle())
                || hasText(info.getContent())
                || hasText(info.getAnnexFile())
                || hasText(info.getReleaseDate())
                || hasText(info.getInformation());
        if (hasData) {
            infoMapper.insert(info);
        }
    }

    /** Returns {@code true} when the value is neither {@code null} nor empty. */
    private static boolean hasText(String value) {
        return value != null && !value.isEmpty();
    }
}
package ai.zerox.controller;
import ai.zerox.crawlers.InfoSpider;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller exposing an endpoint that triggers the crawler.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:12
 */
@RestController
public class InfoController {

    @Autowired
    InfoSpider testSpider;

    /**
     * Handles requests to {@code /test} by invoking the crawler.
     *
     * @return the literal string {@code "ok"} once the crawler call has returned
     */
    @RequestMapping(path = "/test")
    public String doTest() {
        testSpider.doCrawler();
        return "ok";
    }
}
Maven 依赖(pom.xml):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.12</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.example</groupId>
    <artifactId>webmagic</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>demo</name>
    <description>demo</description>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- HTTP client libraries used for the proxy configuration.
             Versions are left to the Spring Boot parent so that httpclient and
             httpcore stay a matching pair (pinning httpcore 4.4.4 next to
             httpclient 4.5.13 mixes incompatible releases). -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- WebMagic dependencies.
             slf4j-log4j12 is excluded because Spring Boot already provides
             Logback as the SLF4J binding; two bindings on the classpath conflict. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Guava is required by the Bloom-filter duplicate remover. -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.4.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.27</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
最后是项目的 yml 配置文件,请各位根据自己的环境自行配置。喜欢的话记得点个关注,后续我会持续更新遇到的各类问题及其解决办法,谢谢!