Java爬虫框架webmagic实战
本文是我关于webmagic爬虫框架的实战——爬取古诗词网站的诗词数据。此代码只用于爬虫学习,勿用于商业用途。
安装webmagic
webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
定义Article类保存诗词数据
package edu.nlp.model;
public class Article {
private int articleId;
/**
* 类型
**/
private String type;
/**
* 作者
**/
private String author;
/**
* 朝代
**/
private String dynasty;
/**
* 作者简介
**/
private String authorInfo;
/**
* 标题
**/
private String title;
/**
* 原文
**/
private String content;
/**
* 译文
**/
private String translation;
/**
* 注释
**/
private String comment;
/**
* 赏析
**/
private String appreciation;
/**
* UUID
**/
private String id;
/**
* 匹配度
**/
private float score;
public int getArticleId() {
return articleId;
}
public void setArticleId(int articleId) {
this.articleId = articleId;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getDynasty() {
return dynasty;
}
public void setDynasty(String dynasty) {
this.dynasty = dynasty;
}
public String getAuthorInfo() {
return authorInfo;
}
public void setAuthorInfo(String authorInfo) {
this.authorInfo = authorInfo;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getTranslation() {
return translation;
}
public void setTranslation(String translation) {
this.translation = translation;
}
public String getComment() {
return comment;
}
public void setComment(String comment) {
this.comment = comment;
}
public String getAppreciation() {
return appreciation;
}
public void setAppreciation(String appreciation) {
this.appreciation = appreciation;
}
public String toString() {
return "Article:{id=" + id + ",score=" + score + ",type=" + type
+ ",dynasty=" + dynasty + ",author=" + author
+ ",authorInfo=" + authorInfo + ",title=" + title + ",content="
+ content + ",translation=" + translation + ",comment=" + comment
+ ",appreciation=" + appreciation + "}";
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public float getScore() {
return score;
}
public void setScore(float score) {
this.score = score;
}
}
爬取中国诗词网的数据
以各个朝代为初始链接,爬取中国诗词网中每条诗词的所属朝代、作者信息、原文、翻译、赏析,保存每条诗词数据为json文本。
package edu.nlp.processer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import edu.nlp.model.Article;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
public class ShiWenPageProcessor implements PageProcessor {

    /** Regex alternation of every dynasty path segment used by shici.net. */
    private final static String PATTER_DYNASTY =
            "(xianqin|hanchao|weijin|nanbeichao|suichao|tangshi|wudai|"
                    + "songci|jinchao|yuanchao|mingchao|qingchao)";

    /** Dynasty index page, e.g. http://www.shici.net/tangshi/ */
    private final static String URL_DYNASTY =
            "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/$";

    /** Author page. */
    private final static String URL_AUTHOR =
            "http://www\\.shici\\.net/shiren/[a-z]{5}\\.html";

    /** Poem page. */
    private final static String URL_ARTICLE =
            "http://www\\.shici\\.net/" + PATTER_DYNASTY + "/[a-z]{5}\\.html";

    /** Translation page (site-relative path; host is prepended when enqueued). */
    private final static String URL_TRANSLATION = "/fanyi/[a-z]{5}\\.html";

    /** Appreciation page (site-relative path). */
    private final static String URL_APPRECIATION = "/shangxi/[a-z]{5}\\.html";

    /**
     * Poems still waiting for their translation/appreciation pages, keyed by
     * the poem page URL. The spider runs with 5 worker threads, so this
     * shared map must be thread-safe — the original plain HashMap was racy.
     */
    private static Map<String, Article> articleMap =
            new java.util.concurrent.ConcurrentHashMap<String, Article>();

    /**
     * Copies every field of the finished article into the page's result
     * fields so the configured pipeline persists it.
     */
    private void saveArticle(Article article, Page page) {
        System.out.println("诗歌:" + article);
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    /** Crawler settings (the original user agent read "Mo zilla" — a typo). */
    private Site site = Site.me().setCycleRetryTimes(5)
            .setRetryTimes(5).setSleepTime(1000)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");

    /**
     * Dispatches on the page URL: dynasty index → enqueue author/poem lists;
     * author page → enqueue poems; poem page → extract fields and, if the
     * poem has translation/appreciation sub-pages, park it in
     * {@link #articleMap} until those pages arrive.
     */
    public void process(Page page) {
        if (page.getUrl().regex(URL_DYNASTY).match()) {
            // Author list of this dynasty.
            List<String> authorUrl = page.getHtml()
                    .xpath("//div[@class='shirenlist']")
                    .links().all();
            page.addTargetRequests(authorUrl);
            // Poem list of this dynasty.
            List<String> essayUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul")
                    .links().all();
            page.addTargetRequests(essayUrl);
            page.setSkip(true); // index page itself produces no result
        } else if (page.getUrl().regex(URL_AUTHOR).match()) {
            // Poems written by this author.
            List<String> poemUrl = page.getHtml()
                    .xpath("//div[@id='related']/ul/li/a/@href")
                    .all();
            page.addTargetRequests(poemUrl);
            page.setSkip(true);
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            Html html = page.getHtml();
            Article article = new Article();
            // Dynasty.
            String dynasty = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            article.setDynasty(dynasty);
            // Author.
            String author = html
                    .xpath("//div[@id='article']/div[@class='info']")
                    .regex("<span>作者:</span><.*>(.*?)</a>").toString();
            article.setAuthor(author);
            // "佚名" (anonymous) works have no author biography; null guard
            // added — the original NPE'd when the author regex missed.
            if (author != null && !author.equals("佚名")) {
                String authorInfo = html
                        .xpath("//div[@class='authorinfo']")
                        .regex("<br>(.*)</div>").toString();
                article.setAuthorInfo(authorInfo);
            }
            // Title.
            String title = html.xpath("div[@id='article']/h1/text()")
                    .toString();
            article.setTitle(title);
            // Original text.
            String content = html
                    .xpath("div[@id='article']/div[@class='content']")
                    .regex("<div class=\"content\">(.*)</div>")
                    .toString();
            article.setContent(content);
            // Relative links to the translation / appreciation sub-pages.
            String translationUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_TRANSLATION)
                    .toString();
            String appreciateUrl = html
                    .xpath("div[@id='related']/ul/li/h3/a/@href")
                    .regex(URL_APPRECIATION)
                    .toString();
            if (translationUrl == null && appreciateUrl == null) {
                // Nothing else to fetch: the article is already complete.
                saveArticle(article, page);
            } else {
                // The absolute sub-page URL doubles as a "still pending"
                // marker — see the startsWith("http") checks below.
                if (translationUrl != null) {
                    article.setTranslation("http://www.shici.net" + translationUrl);
                    page.addTargetRequest("http://www.shici.net" + translationUrl);
                }
                if (appreciateUrl != null) {
                    article.setAppreciation("http://www.shici.net" + appreciateUrl);
                    page.addTargetRequest("http://www.shici.net" + appreciateUrl);
                }
                articleMap.put(page.getUrl().toString(), article);
                page.setSkip(true);
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            // Link back to the poem this translation belongs to.
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1/text()").toString();
            String translation = null;
            String comment = null;
            // The page may carry the translation, the notes, or both;
            // null guard on title added (original NPE'd on a missed xpath).
            if (title != null && title.endsWith("译文及注释")) {
                translation = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>译文</strong><br>(.*?)</p>")
                        .toString();
                comment = html
                        .xpath("//div[@id='article']/div[@class='content']")
                        .regex("<p><strong>注释</strong><br>(.*?)</p>")
                        .toString();
            } else if (title != null) {
                if (title.endsWith("译文")) {
                    translation = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
                if (title.endsWith("注释")) {
                    comment = html
                            .xpath("//div[@id='article']")
                            .regex("<div class=\"content\">(.*?)</div>")
                            .toString();
                }
            }
            System.out.println("注释:" + comment);
            System.out.println("翻译:" + translation);
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // Poem page was never parked (restart/race) — nothing to join.
                page.setSkip(true);
                return;
            }
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                // Appreciation page still pending; keep waiting.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            String articleUrl = "http://www.shici.net" + html
                    .xpath("//div[@class='relatedshici']/h2/a/@href")
                    .toString();
            System.out.println(articleUrl);
            String title = html.xpath("//div[@id='article']/h1").toString();
            String appreciation = html
                    .xpath("//div[@id='article']")
                    .regex("<div class=\"content\">(.*?)</div>")
                    .toString();
            System.out.println("赏析:" + title + appreciation);
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                page.setSkip(true);
                return;
            }
            article.setAppreciation(title + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                // Translation page still pending; keep waiting.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    public Site getSite() {
        return site;
    }

    /** One seed URL per dynasty ("intiUrls" in the original was a typo). */
    private final static String[] INIT_URLS = {
            "http://www.shici.net/xianqin/",
            "http://www.shici.net/hanchao/",
            "http://www.shici.net/weijin/",
            "http://www.shici.net/nanbeichao/",
            "http://www.shici.net/suichao/",
            "http://www.shici.net/tangshi/",
            "http://www.shici.net/wudai/",
            "http://www.shici.net/songci/",
            "http://www.shici.net/jinchao/",
            "http://www.shici.net/yuanchao/",
            "http://www.shici.net/mingchao/",
            "http://www.shici.net/qingchao/",
    };

    public static void main(String[] args) {
        Spider.create(new ShiWenPageProcessor())
                .addUrl(INIT_URLS)
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)
                .run();
        System.out.println("运行结束");
    }
}
运行程序后,生成的数据如下:
爬取好诗文网的数据
以各个朝代下各种诗文类型为初始链接(总共55个链接),爬取好诗文网中每条诗文的所属朝代、作者信息、原文、翻译、赏析,保存每条诗文数据为json文本。
package edu.nlp.processer;
import edu.nlp.model.Article;
import edu.nlp.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
//import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class HaoShiWenPageProcessor implements PageProcessor {

    /** Seed page: a type listing's first page (no &page= suffix, hence the $). */
    private final static String URL_START = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]$";

    /** Paginated type listing. */
    private final static String URL_LIST = "http://www\\.haoshiwen\\.org/type\\.php\\?c=\\d+&x=[1-5]&page=\\d+";

    /** Article (poem) page, site-relative. */
    private final static String URL_ARTICLE = "/view\\.php\\?id=\\d+";

    /** Translation page. */
    private final static String URL_TRANSLATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=2&id=\\d+";

    /** Appreciation page. */
    private final static String URL_APPRECIATION = "http://www\\.haoshiwen\\.org/show\\.php\\?t=1&id=\\d+";

    /**
     * Articles waiting for their translation/appreciation pages, keyed by
     * the relative article URL. The spider runs with 5 worker threads, so
     * both shared maps must be thread-safe — the original plain HashMaps
     * were racy.
     */
    private static Map<String, Article> articleMap =
            new java.util.concurrent.ConcurrentHashMap<String, Article>();

    /** Literary type per article URL, recorded while scanning list pages. */
    private static Map<String, String> articleType =
            new java.util.concurrent.ConcurrentHashMap<String, String>();

    /**
     * Builds the 55 seed URLs: 11 categories (c=1..11) x 5 types (x=1..5).
     * (Renamed from the original typo "intiUrls".)
     */
    private static String[] initUrls() {
        String[] urls = new String[55];
        int count = 0;
        for (int c = 1; c <= 11; c++) {
            for (int x = 1; x <= 5; x++) {
                urls[count++] = "http://www.haoshiwen.org/type.php?c=" + c + "&x=" + x;
            }
        }
        return urls;
    }

    /**
     * Maps the x=N query parameter of a listing URL to a type name.
     *
     * @param url start or list URL containing "c=...&x=N"
     * @return the type name, or null if no type index can be extracted
     */
    private static String getType(Selectable url) {
        // Bug fix: the original pattern was "c=\\d+&x=([1-5)])" — the stray
        // ')' inside the character class also let the group match ')'.
        String numStr = url.regex("c=\\d+&x=([1-5])").toString();
        if (numStr == null) {
            // The original would have thrown NumberFormatException here.
            return null;
        }
        switch (Integer.parseInt(numStr)) {
            case 1:
                return "诗";
            case 2:
                return "词";
            case 3:
                return "曲";
            case 4:
                return "文言文";
            case 5:
                return "辞赋";
            default:
                return null;
        }
    }

    /**
     * Copies every field of the finished article into the page's result
     * fields so the configured pipeline persists it.
     */
    private void saveArticle(Article article, Page page) {
        page.putField("articleId", article.getArticleId());
        page.putField("type", article.getType());
        page.putField("dynasty", article.getDynasty());
        page.putField("author", article.getAuthor());
        page.putField("authorInfo", article.getAuthorInfo());
        page.putField("title", article.getTitle());
        page.putField("content", article.getContent());
        page.putField("translation", article.getTranslation());
        page.putField("comment", article.getComment());
        page.putField("appreciation", article.getAppreciation());
    }

    /**
     * Crawler settings (the original user agent read "Mo zilla" — a typo).
     */
    private Site site = Site.me()
            .setCycleRetryTimes(3)          // retries when a request cycles back
            .setRetryTimes(3)               // retries on download failure
            .setSleepTime(100)              // delay between pages, ms
            .setTimeOut(3000)               // request timeout, ms
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
            .setCharset("UTF-8");

    public Site getSite() {
        return site;
    }

    /**
     * Dispatches on the page URL: listing pages → enqueue article links and
     * record their type; article page → extract fields and, if the article
     * has translation/appreciation sub-pages, park it in {@link #articleMap}
     * until those pages arrive.
     */
    public void process(Page page) {
        if (page.getUrl().regex(URL_START).match()) {
            // Total page count, taken from the "尾页" (last page) link.
            String pageStr = page.getHtml()
                    .xpath("//div[@class='pages']")
                    .regex("/type.php\\?c=\\d+&x=[1-5]&page=(\\d+)\">尾页</a>")
                    .toString();
            if (pageStr != null) {
                int pageNum = Integer.parseInt(pageStr);
                List<String> pageUrl = new ArrayList<String>();
                // Enqueue pages 2..pageNum of this listing.
                for (int i = 2; i <= pageNum; i++) {
                    pageUrl.add(page.getUrl() + "&page=" + i);
                }
                page.addTargetRequests(pageUrl);
            }
            // Articles listed on the first page.
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true); // listing pages produce no result themselves
            // Remember each article's literary type for the article pages.
            String type = getType(page.getUrl());
            if (type != null) { // ConcurrentHashMap rejects null values
                for (String url : articleUrl) {
                    articleType.put(url, type);
                }
            }
        } else if (page.getUrl().regex(URL_LIST).match()) {
            // Same handling as URL_START, minus pagination discovery.
            // (URL_START ends with '$' and URL_LIST requires "&page=", so
            // the two branches are mutually exclusive.)
            List<String> articleUrl = page.getHtml()
                    .xpath("//div[@class='typeleft']/div[@class='sons']")
                    .regex(URL_ARTICLE)
                    .all();
            page.addTargetRequests(articleUrl);
            page.setSkip(true);
            String type = getType(page.getUrl());
            if (type != null) {
                for (String url : articleUrl) {
                    articleType.put(url, type);
                }
            }
        } else if (page.getUrl().regex(URL_ARTICLE).match()) {
            System.out.println("诗词:" + page.getUrl());
            // The relative URL is the key used by articleType / articleMap.
            String articleUrl = page.getUrl().toString().replace("http://www.haoshiwen.org", "");
            Html html = page.getHtml();
            Article article = new Article();
            // Numeric article id; also names the output JSON file.
            article.setArticleId(Integer.parseInt(articleUrl.replace("/view.php?id=", "")));
            article.setType(articleType.get(articleUrl));
            // Dynasty.
            String dynasty = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>朝代:</span>(.*?)</p>").toString();
            article.setDynasty(dynasty);
            // Author; strip the <a> wrapper. Null guard added — the original
            // NPE'd on replaceAll when the regex missed.
            String author = html
                    .xpath("//div[@class='son2']")
                    .regex("<span>作者:</span>(.*?)</p>")
                    .toString();
            if (author != null) {
                author = author.replaceAll("</?a.*?>", "");
            }
            article.setAuthor(author);
            // "佚名" (anonymous) works have no author biography.
            if (author != null && !author.equals("佚名")) {
                String authorInfo = html
                        .regex("<div class=\"son5\" style=\"overflow:auto;\">" +
                                ".*<img.*></a>(.*)<a.*?>\\.▶</a>")
                        .toString();
                // Bug fix: the original compared with != "0" (reference
                // identity), which is always true for an extracted string;
                // the site uses "0" to mean "no biography".
                if (authorInfo != null && !"0".equals(authorInfo)) {
                    article.setAuthorInfo(authorInfo);
                }
            }
            // Title.
            String title = html.xpath("div[@class='son1']/h1/text()")
                    .toString();
            article.setTitle(title);
            // Original text (everything before the "精彩推荐" promo block).
            String content = html
                    .xpath("//div[@class='shileft']/div[@class='son2']")
                    .regex("<p style=\"margin\\-top:0px;\">\\ </p>\\s+(.*?)<br>\\s+" +
                            "<strong><span style=\"color:#FFFFFF;background-color:#E53333;\">精彩推荐</span></strong>")
                    .toString();
            article.setContent(content);
            // Absolute links to the translation / appreciation sub-pages.
            String translateUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_TRANSLATION).toString();
            String appreciationUrl = html
                    .xpath("div[@class='son5']").links()
                    .regex(URL_APPRECIATION).toString();
            if (translateUrl == null && appreciationUrl == null) {
                // No sub-pages: the article is already complete, save it now.
                saveArticle(article, page);
            } else {
                // Park the article until the sub-pages fill it in; the URL
                // stored in the field doubles as a "still pending" marker
                // (see the startsWith("http") checks below).
                if (translateUrl != null) {
                    article.setTranslation(translateUrl);
                    page.addTargetRequest(translateUrl);
                }
                if (appreciationUrl != null) {
                    article.setAppreciation(appreciationUrl);
                    page.addTargetRequest(appreciationUrl);
                }
                articleMap.put(articleUrl, article);
                page.setSkip(true);
            }
        } else if (page.getUrl().regex(URL_TRANSLATION).match()) {
            Html html = page.getHtml();
            // Relative link back to the article this translation belongs to.
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            String translationTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1/text()")
                    .toString();
            String translation = null;
            String comment = null;
            // The page may carry the translation, the notes, or both;
            // null guard on the title added (original NPE'd when it missed).
            if (translationTitle != null && translationTitle.endsWith("译文及注释")) {
                translation = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>译文.*?</strong>(.*?)</p>")
                        .toString();
                if (translation != null) { // strip inline links
                    translation = translation.replaceAll("</?a.*?>", "");
                }
                comment = html
                        .xpath("//div[@class='shangxicont']")
                        .regex("<p><strong>注释.*?</strong>(.*?)</p>")
                        .toString();
                if (comment != null) {
                    comment = comment.replaceAll("</?a.*?>", "");
                }
                if (translation == null && comment == null) {
                    // Some pages merge translation and notes into one body.
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null) {
                        translation = translation.replaceAll("</?a.*?>", "");
                    }
                }
            } else if (translationTitle != null) {
                // Translation only.
                if (translationTitle.endsWith("译文")) {
                    translation = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (translation != null) {
                        translation = translation.replaceAll("</?a.*?>", "");
                    }
                }
                // Notes only.
                if (translationTitle.endsWith("注释")) {
                    comment = html
                            .xpath("//div[@class='shangxicont']")
                            .regex("<p>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                            .toString();
                    if (comment != null) {
                        comment = comment.replaceAll("</?a.*?>", "");
                    }
                }
            }
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                // Article page was never parked (restart/race) — drop quietly.
                page.setSkip(true);
                return;
            }
            article.setTranslation(translation);
            article.setComment(comment);
            String appreciation = article.getAppreciation();
            if (appreciation != null && appreciation.startsWith("http")) {
                // Appreciation page still pending; keep waiting.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        } else if (page.getUrl().regex(URL_APPRECIATION).match()) {
            Html html = page.getHtml();
            String articleUrl = html
                    .xpath("//div[@class='sontitle']/span/a/@href")
                    .toString();
            String appreciateTitle = html
                    .xpath("//div[@class='shileft']/div[@class='son1']/h1")
                    .toString();
            String appreciation = html
                    .xpath("//div[@class='shangxicont']")
                    .regex("<p.*>\\s?作者[::]佚名\\s?</p>(.*?)<p style=")
                    .toString();
            if (appreciation != null) {
                appreciation = appreciation.replaceAll("</?a.*?>", "");
            }
            Article article = articleMap.get(articleUrl);
            if (article == null) {
                page.setSkip(true);
                return;
            }
            article.setAppreciation(appreciateTitle + appreciation);
            String translation = article.getTranslation();
            if (translation != null && translation.startsWith("http")) {
                // Translation page still pending; keep waiting.
                page.setSkip(true);
            } else {
                saveArticle(article, page);
                articleMap.remove(articleUrl);
            }
        }
    }

    /**
     * Number of distinct article URLs discovered so far.
     */
    public static int articleCount() {
        return articleType.size();
    }

    public static void main(String[] args) {
        HaoShiWenPageProcessor processor = new HaoShiWenPageProcessor();
        Spider.create(processor)                 // page processor
                .addUrl(initUrls())              // 55 seed URLs
                // Custom JsonFilePipeline: names each JSON file after the
                // article id instead of the MD5 of the URL, so files map
                // one-to-one to articles and re-crawls don't duplicate.
                .addPipeline(new JsonFilePipeline("/Users/liaoxuyi/Desktop/data"))
                .thread(5)                       // worker threads
                .run();
        // Static call (the original invoked it through the instance).
        System.out.println("诗词总数有:" + articleCount()); // ~75604
        System.out.println("运行结束");
    }
}
自定义JsonFilePipeline保存json数据
由于webmagic默认的JsonFilePipeline生成的json文件是以链接URL的MD5值命名的,文件名无法与网站的诗文链接一一对应,所以这里自定义JsonFilePipeline,将保存的json文件名设置为诗文的ID,方便查找原始的诗文内容。
package edu.nlp.pipeline;
import com.alibaba.fastjson.JSON;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /** Defaults to /data/webmagic, mirroring webmagic's stock pipeline. */
    public JsonFilePipeline() {
        this.setPath("/data/webmagic");
    }

    /**
     * @param path root directory the JSON files are written under
     */
    public JsonFilePipeline(String path) {
        this.setPath(path);
    }

    /**
     * Writes the result items of one page as a JSON file named after the
     * "articleId" result field — unlike the stock pipeline, which names
     * files after the MD5 of the URL — so each file maps directly back to
     * the crawled article.
     */
    public void process(ResultItems resultItems, Task task) {
        String dir = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        // try-with-resources: the original leaked the PrintWriter (and the
        // underlying FileWriter) whenever write() threw.
        try (PrintWriter printWriter = new PrintWriter(
                new FileWriter(this.getFile(dir + resultItems.get("articleId") + ".json")))) {
            printWriter.write(JSON.toJSONString(resultItems.getAll()));
        } catch (IOException e) {
            this.logger.warn("write file error", e);
        }
    }
}
运行程序后,生成的数据如下: