从 http://webmagic.io/download.html 下载的依赖 jar 包比较多,全部导入已有项目后容易出现冲突,因此可以只导入下面几个 jar 包
然后写一个公用的方法供调用
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic {@link PageProcessor} that extracts two values from a fetched page and
 * exposes them through the static helpers {@link #issue(String)} and
 * {@link #jczq(String)}.
 *
 * <p>Each static helper creates its own processor instance and reads the result
 * back from that instance once the crawl has finished. Results are therefore never
 * shared between calls: a crawl that fails returns {@code null} rather than a value
 * left behind by an earlier crawl, and concurrent callers cannot overwrite each
 * other's result.
 */
public class WebMagic implements PageProcessor {

    /** Text of the {@code span} element whose id is {@code bet_period}. */
    private static final String ISSUE_XPATH = "//span[@id='bet_period']/text()";

    /**
     * Text of the {@code dt} element under the {@code dl} element under the
     * {@code div} whose class is {@code dataBody unAttention}.
     */
    private static final String JCZQ_TITLE_XPATH =
            "//div[@class='dataBody unAttention']/dl/dt/text()";

    /** Text of the {@code span} nested inside the same {@code dt} element. */
    private static final String JCZQ_DETAIL_XPATH =
            "//div[@class='dataBody unAttention']/dl/dt/span/text()";

    /** Crawl settings: retry times 3, sleep time 100. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    // Per-crawl results. Volatile because the spider is started with several threads,
    // so process() may run on a different thread than the one that reads the result.
    private volatile String issue;
    private volatile String jczq;

    /**
     * Extracts the wanted values from the downloaded page.
     *
     * <p>The {@code bet_period} text is published as the {@code "content"} result
     * field. When it is absent the page is marked as skipped, but the remaining
     * extraction still runs because the other value may be present on pages that
     * have no {@code bet_period} element.
     *
     * @param page the page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Evaluate the XPath once and reuse the value for the field, the skip check and the result.
        String period = page.getHtml().xpath(ISSUE_XPATH).toString();
        page.putField("content", period);
        if (period == null) {
            // skip this page
            page.setSkip(true);
        }
        issue = period;

        String title = page.getHtml().xpath(JCZQ_TITLE_XPATH).toString();
        String detail = page.getHtml().xpath(JCZQ_DETAIL_XPATH).toString();
        jczq = joinNullable(title, detail);
    }

    /** Returns the crawl settings used by the spider. */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls {@code url} and returns the text of the {@code bet_period} span.
     *
     * @param url the page to crawl
     * @return the extracted text, or {@code null} if it was not found or the page
     *         was never processed
     */
    public static String issue(String url) {
        return crawl(url).issue;
    }

    /**
     * Crawls {@code url} and returns the {@code dt} text followed by the text of
     * its nested {@code span}.
     *
     * @param url the page to crawl
     * @return the concatenated text, or {@code null} if neither part was found or
     *         the page was never processed
     */
    public static String jczq(String url) {
        return crawl(url).jczq;
    }

    /** Runs a blocking crawl of {@code url} with a fresh processor and returns that processor. */
    private static WebMagic crawl(String url) {
        WebMagic processor = new WebMagic();
        Spider.create(processor).addUrl(url)
                .addPipeline(new ConsolePipeline()).thread(5).run();
        return processor;
    }

    /**
     * Concatenates two possibly-null strings, treating a missing part as empty so
     * the literal text "null" never leaks into the result.
     *
     * @return the concatenation, or {@code null} when both parts are {@code null}
     */
    private static String joinNullable(String first, String second) {
        if (first == null && second == null) {
            return null;
        }
        return (first == null ? "" : first) + (second == null ? "" : second);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
在其他类中通过
String issue = WebMagic.issue("http://caipiao.163.com/order/dlt/");
String jczq = WebMagic.jczq("http://caipiao.163.com/order/preBet_jczqspfmixp.html");
- 1
- 2
可调用爬虫,得到爬取的值