<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the SpiderJava crawler project. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>org.example</groupId>
<artifactId>SpiderJava</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- NOTE(review): no compiler source/target level is configured here, so the
     build uses the maven-compiler-plugin defaults; confirm that is intended. -->
<dependencies>
<!-- WebMagic crawler framework; the Java sources import the
     us.codecraft.webmagic Spider, Page, Site, PageProcessor and Pipeline types. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- WebMagic extension module, kept at the same version as webmagic-core. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Apache Commons Lang 3 utility library. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.10</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<!-- Lombok supplies the @Data, @Slf4j and @SneakyThrows annotations used by the
     sources; "provided" scope keeps it on the compile classpath only. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.12</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
package spider.pojo.po;
import lombok.Data;
import spider.common.annotation.CssSelector;
import spider.common.annotation.Html;
import java.util.List;
/**
 * Holder for the values scraped from a blog page.
 *
 * <p>Lombok's {@code @Data} generates the accessors (for example
 * {@code getTitle()}), {@code equals}/{@code hashCode} and {@code toString}.
 * {@code HtmlProcessor} walks this class's declared fields reflectively and
 * fills every field that carries a {@link CssSelector} annotation.
 *
 * <p>NOTE(review): {@code @Html} is a project annotation whose definition is not
 * visible here; presumably it marks the class as a page mapping - confirm.
 */
@Html
@Data
public class SimpleBlogHtml {
/**
 * Texts extracted from the elements matched by the CSS selector
 * {@code ".oneline span"}; assigned reflectively, one entry per match.
 */
@CssSelector(selector = ".oneline span")
private List<String> title;
}
package spider.common.bean;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import spider.common.annotation.CssSelector;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.lang.reflect.Field;
import java.util.List;
@Slf4j
public class HtmlProcessor implements PageProcessor {
@SneakyThrows
public void process(Page page) {
page.setCharset("UTF-8");
SimpleBlogHtml blogHtml = new SimpleBlogHtml();
for(Field field: SimpleBlogHtml.class.getDeclaredFields()) {
field.setAccessible(true);
CssSelector cssSelector = field.getAnnotation(CssSelector.class);
System.out.println(cssSelector);
if(cssSelector!=null) {
String selector = cssSelector.selector();
List<String> result = page.getHtml().css(selector).xpath("//span/text()").all();
field.set(blogHtml,result);
}
}
System.out.println(blogHtml.getTitle());
page.putField("html",blogHtml);
}
private final Site site = new Site();
{
site.setCharset("UTF-8");
}
public Site getSite() {
return site;
}
}
package spider.common.bean;
import lombok.extern.slf4j.Slf4j;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that logs the object an upstream processor stored under the
 * {@code "html"} result key.
 *
 * @param <T> the type the stored result is read as
 */
@Slf4j
public class DownloadPipeLine<T> implements Pipeline {

    /**
     * Reads the {@code "html"} entry from the result items and logs it at INFO level.
     *
     * @param resultItems the extraction results of one page
     * @param task        the crawl task that produced the results (not used)
     */
    public void process(ResultItems resultItems, Task task) {
        log.info("data {}", resultItems.<T>get("html"));
    }
}
package spider;
import spider.common.bean.DownloadPipeLine;
import spider.common.bean.HtmlProcessor;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
/**
 * Command-line entry point: crawls one fixed blog URL with
 * {@link HtmlProcessor} and passes the result to {@link DownloadPipeLine}.
 */
public class Main {

    /**
     * Builds the spider and runs it; the call returns when {@code run()} does.
     *
     * @param args command-line arguments (ignored)
     */
    public static void main(String[] args) {
        // Same call order as a single fluent chain, written out step by step.
        Spider crawler = Spider.create(new HtmlProcessor());
        crawler = crawler.addUrl("https://blog.csdn.net/qq_43923045");
        crawler = crawler.addPipeline(new DownloadPipeLine<SimpleBlogHtml>());
        crawler.run();
    }
}