WebMagic 一系列问题

说一下我在使用 WebMagic 时遇到的问题:由于我连的是公司内网,无法直接访问并解析目标页面——在这种网络环境下,WebMagic 下载页面必须配置代理。我之前没有考虑到这一点,下面附上解决代码:

package ai.zerox.proxy;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
/**
 * ProxyProvider that routes every page download through a fixed corporate
 * intranet proxy. Needed when the crawler host can only reach the target
 * site via the company proxy.
 *
 * @author AiShuangPeng
 * @date 2023/7/19 10:38
 */
public class CustomProxyProvider implements ProxyProvider {

  // TODO: replace with your intranet proxy address. The original snippet had a
  // bare Chinese placeholder for the port, which does not compile; consider
  // externalizing these values to configuration instead of hard-coding them.
  private static final String PROXY_HOST = "填写公司内网ip";
  private static final int PROXY_PORT = 8080; // 填写端口号

  /** No proxy pool is maintained, so a finished proxy is simply discarded. */
  @Override
  public void returnProxy(Proxy proxy, Page page, Task task) {
  }

  /**
   * Returns the single hard-coded proxy used for every request. A real proxy
   * pool could be consulted here instead.
   *
   * @param task the crawl task requesting a proxy (unused)
   * @return the proxy to route the download through
   */
  @Override
  public Proxy getProxy(Task task) {
    return new Proxy(PROXY_HOST, PROXY_PORT);
  }
}
package ai.zerox.proxy;
import us.codecraft.webmagic.downloader.HttpClientDownloader;

/**
 * {@link HttpClientDownloader} preconfigured with the intranet proxy
 * provider, so every page request goes through the company proxy.
 */
public class CustomDownloader extends HttpClientDownloader {

  /** Wires the custom proxy provider into the downloader. */
  public CustomDownloader() {
    setProxyProvider(new CustomProxyProvider());
  }
}
package ai.zerox.crawlers;

import ai.zerox.proxy.CustomDownloader;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;

/**
 * Assembles the WebMagic components and launches the crawler.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:01
 */
@Component
public class InfoSpider {

  @Autowired
  private PageProcessor processor;

  @Autowired
  private Pipeline pipeline;

  @Autowired
  private Scheduler scheduler;

  /** Start page of the crawl; replace with the site you want to scrape. */
  private final String URL = "http://*****************/index.html";

  /**
   * Wires processor, pipeline, scheduler and the proxy-aware downloader
   * together and starts the crawl. {@code Spider.start()} runs the spider
   * on its own thread, so this method returns immediately.
   *
   * @author AiShuangPeng
   * @date 2023/7/19 8:34
   */
  public void doCrawler() {
    Spider spider = Spider.create(processor);
    // Custom pipeline that persists parsed results.
    spider.addPipeline(pipeline);
    // Scheduler configured with a Bloom-filter duplicate remover.
    spider.setScheduler(scheduler);
    // Seed URL to start crawling from.
    spider.addUrl(URL);
    // Proxy-aware downloader (required on the corporate intranet).
    spider.setDownloader(new CustomDownloader());
    spider.start();
  }
}
package ai.zerox.crawlers;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;

/**
 * Spring configuration that provides the WebMagic {@link Scheduler}.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/5/22 18:07
 */
@Configuration
public class InfoScheduler {

  /** Expected number of distinct URLs; sizes the Bloom filter. */
  private static final int EXPECTED_URL_COUNT = 10000000;

  /**
   * In-memory queue scheduler using a Bloom filter for URL de-duplication
   * (constant memory with a small false-positive rate, suited to large
   * crawls).
   *
   * <p>The historical bean name {@code crateScheduler} (a typo for
   * "create") is kept as an alias so any existing by-name injection keeps
   * working.
   *
   * @return the configured scheduler
   */
  @Bean(name = {"createScheduler", "crateScheduler"})
  public Scheduler createScheduler() {
    QueueScheduler queueScheduler = new QueueScheduler();
    queueScheduler.setDuplicateRemover(new BloomFilterDuplicateRemover(EXPECTED_URL_COUNT));
    return queueScheduler;
  }
}
package ai.zerox.crawlers;

import ai.zerox.pojo.Info;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;

/**
 * PageProcessor that tells list pages from detail pages and extracts the
 * article fields from detail pages.
 *
 * @author AiShuangPeng
 * @date 2023/5/19 21:09
 */
@Component
public class InfoProcessor implements PageProcessor {

  /** Crawler-wide site settings: charset, retries, politeness delay. */
  private static final Site SITE = Site.me()
    .setCharset("UTF-8")
    .setRetryTimes(30)
    .setSleepTime(2000);

  /** Length of the "label:" prefix stripped from the h6 span values. */
  private static final int LABEL_PREFIX_LEN = 5;

  @Override
  public void process(Page page) {
    Html html = page.getHtml();
    // List pages contain <li><span> nodes; detail pages do not.
    List<Selectable> nodes = html.css("li > span", "text").nodes();
    if (!nodes.isEmpty()) {
      // List page: queue every article link for crawling.
      List<String> linksAll = html.css(".conter_listCont.AllListCon li").links().all();
      page.addTargetRequests(linksAll);
      // Queue the "next page" link only when one exists. The original code
      // used String.valueOf(...) on the Selectable, which enqueued the
      // literal string "null" on the last page.
      String nextPage = html.css(".page div .next").links().get();
      if (nextPage != null) {
        page.addTargetRequest(nextPage);
      }
      // A list page itself carries no data for the pipeline.
      page.getResultItems().setSkip(true);
    } else {
      this.parseInfo(page);
    }
  }

  /**
   * Parses a detail page into an {@link Info} and hands it to the pipeline.
   * Missing fields are stored as {@code null}. Bounds/null guards were added:
   * the original threw NullPointerException when no h1 matched and
   * IndexOutOfBoundsException when fewer than two h6 spans were present.
   */
  public void parseInfo(Page page) {
    Html html = page.getHtml();

    // Article title; null when the selector matches nothing or is blank.
    String title = emptyToNull(html.css("h1", "text").get());

    // "信息来源:" / "发布时间:" spans; strip the 5-character label prefix.
    List<String> h6SpanAll = html.css("h6 span", "text").all();
    String information = h6SpanAll.size() > 0 ? stripLabel(h6SpanAll.get(0)) : null;
    String releaseTime = h6SpanAll.size() > 1 ? stripLabel(h6SpanAll.get(1)) : null;

    // Body paragraphs; the last two <p> elements are boilerplate and skipped.
    List<String> contentAll = html.css(".news_cont_d_wrap p", "text").all();
    StringBuilder body = new StringBuilder();
    for (int i = 0; i < contentAll.size() - 2; i++) {
      body.append(contentAll.get(i));
    }
    String content = emptyToNull(body.toString());

    // Attachment links, stored as the list's string form ("[url1, url2]").
    List<String> linksAll = html.css("div .fjdown").links().all();
    String annexFile = linksAll.isEmpty() ? null : linksAll.toString();

    // Package the extracted fields and hand them to the pipeline.
    Info info = new Info();
    info.setTitle(title);
    info.setContent(content);
    info.setAnnexFile(annexFile);
    info.setReleaseDate(releaseTime);
    info.setInformation(information);
    page.putField("info", info);
  }

  /** @return null for null/empty input, the input unchanged otherwise */
  private static String emptyToNull(String s) {
    return (s == null || s.isEmpty()) ? null : s;
  }

  /** Strips the leading 5-character label prefix, bounds-safe. */
  private static String stripLabel(String s) {
    if (s == null) {
      return null;
    }
    return s.length() > LABEL_PREFIX_LEN ? s.substring(LABEL_PREFIX_LEN) : "";
  }

  @Override
  public Site getSite() {
    return SITE;
  }

}
package ai.zerox.crawlers;

import ai.zerox.mapper.InfoMapper;
import ai.zerox.pojo.Info;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that persists a parsed {@link Info} to the database.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:46
 */
@Component
public class InfoPipeline implements Pipeline {

  @Autowired
  private InfoMapper infoMapper;

  /**
   * Saves the Info produced by the processor, skipping results that carry
   * no data. Null-safe: the processor stores {@code null} for missing
   * fields, and a page may carry no "info" entry at all — the original
   * code called {@code isEmpty()} directly on those values and threw
   * NullPointerException.
   */
  @Override
  @Transactional
  public void process(ResultItems resultItems, Task task) {
    // Data handed over by the PageProcessor; may be absent.
    Info info = resultItems.get("info");
    if (info == null) {
      return; // nothing extracted for this page
    }
    // Insert only when at least one field actually holds data.
    if (hasText(info.getTitle())
      || hasText(info.getContent())
      || hasText(info.getAnnexFile())
      || hasText(info.getReleaseDate())
      || hasText(info.getInformation())) {
      infoMapper.insert(info);
    }
  }

  /** @return true when the value is non-null and non-empty */
  private static boolean hasText(String value) {
    return value != null && !value.isEmpty();
  }
}
package ai.zerox.controller;

import ai.zerox.crawlers.InfoSpider;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST endpoint that triggers a crawl run.
 *
 * @author AiShuangPeng
 * @version 1.0
 * @date 2023/7/19 20:12
 */
@RestController
public class InfoController {

  @Autowired
  InfoSpider testSpider;

  /**
   * Kicks off the crawler. The spider runs on its own thread, so this
   * request returns "ok" immediately rather than waiting for the crawl.
   */
  @RequestMapping("/test")
  public String doTest() {
    this.testSpider.doCrawler();
    return "ok";
  }
}

maven依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.7.12</version>
    <relativePath/> <!-- lookup parent from repository -->
  </parent>
  <groupId>com.example</groupId>
  <artifactId>webmagic</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>demo</name>
  <description>demo</description>
  <properties>
    <java.version>1.8</java.version>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-starter-web</artifactId>
    </dependency>

    <!-- HTTP client used by the proxy-aware downloader.
         httpcore must be >= 4.4.13 to match httpclient 4.5.13; the older
         4.4.4 can cause NoSuchMethodError at runtime. -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.13</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.13</version>
    </dependency>

    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <optional>true</optional>
    </dependency>
    <dependency>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-starter-test</artifactId>
      <scope>test</scope>
    </dependency>
    <!-- WebMagic crawler framework -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>0.7.3</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>0.7.3</version>
    </dependency>
    <!-- Guava supplies the BloomFilter used for URL de-duplication -->
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>16.0</version>
    </dependency>

    <dependency>
      <groupId>com.baomidou</groupId>
      <artifactId>mybatis-plus-boot-starter</artifactId>
      <version>3.4.3</version>
    </dependency>

    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>8.0.27</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-maven-plugin</artifactId>
        <configuration>
          <excludes>
            <exclude>
              <groupId>org.projectlombok</groupId>
              <artifactId>lombok</artifactId>
            </exclude>
          </excludes>
        </configuration>
      </plugin>
    </plugins>
  </build>

</project>

最后还有一个 yml 配置文件(数据库连接等信息),请各位根据自己的环境自行配置。喜欢的话记得点个关注,后期会持续更新开发中遇到的各类问题,谢谢!

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值