webmagic爬取页面
这里举一个简单的例子,爬取一下代理网站的ip和端口号
免费代理
因为分页很简单(只需要把 URL 中的 page 参数依次加 1),所以这里没有使用正则表达式来匹配分页链接。
/**
 * WebMagic page processor that scrapes proxy IP / port pairs from the
 * ip3366 free-proxy listing and walks the pagination by incrementing the
 * {@code page} query parameter.
 *
 * <p>Rows accumulate across pages in {@link #lists}. After each page that
 * contains rows, a snapshot of everything collected so far is published under
 * the {@code "results"} field for the pipelines.
 *
 * <p>NOTE(review): the page counter and the accumulator are plain,
 * unsynchronized instance state, so this presumably expects the spider to run
 * with a single thread - confirm before raising the thread count.
 */
public class JobPageProcessor implements PageProcessor {

    /** CSS selector matching one table row per proxy entry. */
    private static final String ROW_SELECTOR =
            "#content > section > div.container > table > tbody > tr";

    /** Listing URL without the trailing page number. */
    private static final String PAGE_URL_PREFIX =
            "https://proxy.ip3366.net/free/?action=china&page=";

    /** Built once so every {@link #getSite()} call returns the same configuration. */
    private final Site site = Site.me();

    /** Number of the page most recently queued; the crawl starts on page 1. */
    int i = 1;

    /** Every proxy row collected so far, across all processed pages. */
    List<proxyExcel> lists = new ArrayList<>();

    /**
     * Extracts the proxy rows of one listing page, publishes the accumulated
     * results, and queues the next page.
     *
     * <p>When the page contains no rows the crawl stops: the page is marked as
     * skipped so pipelines are not invoked without results, and no further
     * page is queued.
     *
     * @param page the downloaded listing page
     */
    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml().css(ROW_SELECTOR).nodes();
        if (nodes.isEmpty()) {
            // Past the last page (or the layout changed): do not hand an empty
            // result to the pipelines and do not keep requesting pages forever.
            page.setSkip(true);
            return;
        }

        for (Selectable row : nodes) {
            String ip = row.$("td:nth-child(1)", "text").get();
            String port = row.$("td:nth-child(2)", "text").get();
            if (ip == null || port == null) {
                // Row without both cells; nothing useful to export.
                continue;
            }
            lists.add(new proxyExcel(ip, port));
            System.out.println(ip + " " + port);
        }

        // Publish once per page, as a copy, so the pipeline never observes the
        // accumulator while a later page is still appending to it.
        page.putField("results", new ArrayList<>(lists));

        i++;
        page.addTargetRequest(PAGE_URL_PREFIX + i);
    }

    /**
     * @return the crawl configuration (WebMagic defaults)
     */
    @Override
    public Site getSite() {
        return site;
    }
}
使用excel4j
在 pom.xml 中添加 Maven 依赖
<dependency>
<groupId>com.github.crab2died</groupId>
<artifactId>Excel4J</artifactId>
<version>3.0.0-Alpha</version>
</dependency>
定义一个与 Excel 列对应的实体类(用 @ExcelField 注解指定列标题)
import com.github.crab2died.annotation.ExcelField;
/**
 * One proxy entry: an IP address and its port, both kept as text.
 *
 * <p>Each field carries an {@code @ExcelField} annotation whose title is the
 * column header used for that field.
 */
public class proxyExcel {

    /** Proxy IP address; exported under the "ip" column. */
    @ExcelField(title = "ip")
    private String ip;

    /** Proxy port; exported under the "port" column. */
    @ExcelField(title = "port")
    private String port;

    /**
     * Creates an entry from the two scraped values.
     *
     * @param ip   the proxy IP address
     * @param port the proxy port
     */
    public proxyExcel(String ip, String port) {
        this.ip = ip;
        this.port = port;
    }

    /** @return the proxy IP address */
    public String getIp() {
        return this.ip;
    }

    /** @return the proxy port */
    public String getPort() {
        return this.port;
    }

    /** @param ip the proxy IP address to store */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /** @param port the proxy port to store */
    public void setPort(String port) {
        this.port = port;
    }

    /**
     * @return a debug representation of the form
     *         {@code proxyExcel{ip='...', port='...'}}
     */
    @Override
    public String toString() {
        return String.format("proxyExcel{ip='%s', port='%s'}", ip, port);
    }
}
自定义Pipeline
/**
 * WebMagic pipeline that writes the list stored under the {@code "results"}
 * field to an .xlsx file through Excel4J.
 *
 * <p>The export call is given the complete list each time it runs, so the
 * list handed over must already contain every row that should end up in the
 * file.
 */
public class ExcelPipeline implements Pipeline {

    /** Output location used by the no-argument constructor. */
    private static final String DEFAULT_TARGET_PATH = "F:/B.xlsx";

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** File the workbook is written to. */
    private final String targetPath;

    /** Creates a pipeline that writes to the default location, {@code F:/B.xlsx}. */
    public ExcelPipeline() {
        this(DEFAULT_TARGET_PATH);
    }

    /**
     * Creates a pipeline that writes to the given file.
     *
     * @param targetPath path of the .xlsx file to write
     * @throws IllegalArgumentException if {@code targetPath} is null or empty
     */
    public ExcelPipeline(String targetPath) {
        if (targetPath == null || targetPath.isEmpty()) {
            throw new IllegalArgumentException("targetPath must not be null or empty");
        }
        this.targetPath = targetPath;
    }

    /**
     * Exports the {@code "results"} list of the given result items.
     *
     * <p>Does nothing when the field is absent or the list is empty. Export
     * failures are logged and not rethrown, so a failed write does not abort
     * the crawl.
     *
     * @param resultItems the extracted fields of one processed page
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<?> results = (List<?>) resultItems.get("results");
        if (results == null || results.isEmpty()) {
            logger.warn("No \"results\" to export; skipping write to {}", targetPath);
            return;
        }

        logger.info("Exporting {} rows to {}", results.size(), targetPath);
        try {
            // Arguments after the class: write header row, default sheet name, .xlsx format.
            ExcelUtils.getInstance()
                    .exportObjects2Excel(results, proxyExcel.class, true, null, true, targetPath);
        } catch (Excel4JException e) {
            logger.error("Excel4J failed to export results to {}", targetPath, e);
        } catch (IOException e) {
            logger.error("I/O error while writing {}", targetPath, e);
        }
    }
}
这样就可以存到excel中啦!