pom坐标:
<!-- Maven dependencies for the crawler application. -->
<!-- NOTE(review): several entries below declare no <version>; presumably it is managed by a parent POM / BOM that is not shown here. Confirm. -->
<dependencies>
<!-- Spring MVC (Spring Boot web starter) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- tk.mybatis common Mapper starter -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper-spring-boot-starter</artifactId>
<version>2.0.4</version>
</dependency>
<!-- MySQL JDBC connector -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- WebMagic core; the slf4j-log4j12 binding is excluded from its transitive dependencies -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Guava, for WebMagic's Bloom-filter support -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
<!-- Utility library -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- Unit testing -->
<!-- NOTE(review): no <scope>test</scope> is declared, so this is on the compile classpath; confirm whether that is intended. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
</dependency>
<!-- simhash; lucene-core is excluded from its transitive dependencies -->
<dependency>
<groupId>com.lou</groupId>
<artifactId>simhasher</artifactId>
<version>0.0.1-SNAPSHOT</version>
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
AccountProcessor:
import com.lol.pojo.Account;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.List;
/**
 * WebMagic {@link PageProcessor} that extracts account listings from a search
 * result page, publishes them under the {@code "accounts"} result field and
 * queues the next result page for crawling.
 *
 * <p>The crawl itself is started by the scheduled {@link #process()} method,
 * which hands the extracted data to the injected {@link SpringDataPipeline}.
 */
@Component
public class AccountProcessor implements PageProcessor {

    // Search criteria: LoL, type "account", sorted by price ascending.
    private final String url = "https://www.jiaoyimao.com/g5654-c1/r4.html";

    private final Site site = Site.me()
            .setCharset("utf-8")        // page encoding
            .setTimeOut(10 * 1000)      // download timeout
            .setRetrySleepTime(3000)    // pause between retries
            .setRetryTimes(3);          // number of retries

    @Autowired
    private SpringDataPipeline springDataPipeline;

    /**
     * Parses one listing page.
     *
     * <p>Every {@code li} under {@code div.bd ul.specialList} is mapped to an
     * {@link Account}. The resulting list is always stored in the
     * {@code "accounts"} field, even when it is empty, so that downstream
     * pipelines never see a missing field.
     *
     * @param page the downloaded page to parse
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        // Collect the data of each listing entry into an Account object.
        List<Account> accounts = new ArrayList<>();
        for (Selectable item : html.css("div.bd ul.specialList li").nodes()) {
            Account account = new Account();
            account.setTitle(item.css("span.is-account a", "text").toString());
            account.setPrice(item.css("span.price", "text").toString());
            account.setUrl(item.css("span.is-account a").links().toString());
            account.setArea(item.css("div.con", "text").toString());
            System.out.println(account);
            accounts.add(account);
        }
        // Publish once, after the loop: the field is then present even for a
        // page without any listing entries.
        page.putField("accounts", accounts);

        // Determine the URL of the next page from the pager buttons. When more
        // than one button is present the second one is used (according to the
        // original note this is the situation from the second page onwards);
        // otherwise the first link found is used.
        Selectable pageButtons = html.css("a.page-btn");
        List<Selectable> buttonNodes = pageButtons.nodes();
        String nextUrl;
        if (buttonNodes.size() > 1) {
            nextUrl = buttonNodes.get(1).links().toString();
        } else {
            nextUrl = pageButtons.links().toString();
        }

        // Queue the next page; nothing to queue when no pager link was found.
        if (nextUrl != null) {
            page.addTargetRequest(nextUrl);
        }
    }

    /**
     * @return the crawl settings (charset, timeout, retry policy) used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one complete crawl starting at {@link #url}.
     *
     * <p>Scheduling: {@code fixedDelay} is the pause between the end of one
     * run and the start of the next ({@code initialDelay}, not used here,
     * would be the wait before the first run). {@code Spider.run()} blocks
     * until the crawl has finished.
     */
    @Scheduled(fixedDelay = 500)
    public void process() {
        // Use this Spring-managed instance rather than constructing a second,
        // unmanaged processor.
        Spider.create(this)
                .addUrl(url)
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(10)
                .addPipeline(this.springDataPipeline)
                .run();
    }
}
SpringDataPipeline:
import com.lol.pojo.Account;
import com.lol.service.AccountService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
/**
 * WebMagic {@link Pipeline} that persists crawled {@link Account} objects
 * through {@link AccountService}, skipping accounts for which a record with
 * the same title is already found.
 */
@Component
public class SpringDataPipeline implements Pipeline {

    @Autowired
    private AccountService accountService;

    /**
     * Stores the accounts found under the {@code "accounts"} result field.
     *
     * @param resultItems extraction results of one page; the {@code "accounts"}
     *                    field may be absent or empty, in which case nothing is stored
     * @param task        the crawl task that produced the results (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Fetch the list of extracted accounts.
        List<Account> accounts = resultItems.get("accounts");

        // The field is missing when the processor did not publish it; treat
        // that like an empty result instead of failing with a NullPointerException.
        if (accounts == null || accounts.isEmpty()) {
            return;
        }

        for (Account account : accounts) {
            // Look for existing records with the same title first.
            List<Account> byTitle = accountService.findByTitle(account);
            // The null check must come before any dereference of the list.
            if (byTitle == null || byTitle.isEmpty()) {
                accountService.add(account);
            }
        }
    }
}