最近做了一个网站 www.ifunit.com。为了省事,我直接采集了 CSDN 的文章,采集入口是 CSDN 的搜索结果页。以下是主要的采集代码:
// NOTE(review): the signature of the enclosing method is not visible in this excerpt;
// the statements below are its body. It returns a UrlListResponse built from `req`.

// Fetch the page: GET when the request says so, otherwise POST with the form parameters.
HttpResponse response = req.getMethod() == Method.GET ? HttpUtil.httpGet(req.getUrl(), req.getHeaderMap())
: HttpUtil.httpPost(req.getUrl(), req.getFormParam(), req.getHeaderMap());
String html = HttpUtil.response2Html(response);
// Parse with the request URL as the document's base URI.
Document doc = Jsoup.parse(html, req.getUrl());
UrlListResponse rep = new UrlListResponse();
List<String> list = new ArrayList<String>();
Elements urlElements = doc.body().select("h3.rt a");// select the links to extract URLs from
for (Element element : urlElements) {
// The raw href attribute is stored as-is (it is not resolved against the base URI here).
list.add(element.attr("href"));
}
List<String> all = WpJdbc.queryUrls();// look up the URLs that have already been collected
// Drop everything that was collected before so only new URLs are returned.
list.removeAll(all);
rep.setList(list);
return rep;
/**
 * Fetches the page addressed by {@code req} and turns it into a {@link BlogResponse}.
 *
 * <p>The page is requested with GET or POST according to {@code req.getMethod()}. The
 * keywords and description come from the matching {@code <meta>} elements, the title from
 * {@code h1 span.link_title a} (with nested {@code <font>} elements removed), the tags from
 * {@code .tag2box a}, and the content from {@code .article_content} after it has been run
 * through {@link #getContent(Elements, Request)}. The author is always {@code "csdn"} and
 * the link is the request URL.
 *
 * @param req the request describing the page to fetch
 * @return the populated blog entry
 */
public BlogResponse parser(Request req) {
    HttpResponse response;
    if (req.getMethod() == Method.GET) {
        response = HttpUtil.httpGet(req.getUrl(), req.getHeaderMap());
    } else {
        response = HttpUtil.httpPost(req.getUrl(), req.getFormParam(), req.getHeaderMap());
    }
    Document page = Jsoup.parse(HttpUtil.response2Html(response), req.getUrl());

    // Scan every <meta>; when several match, the last one wins.
    String keywords = "";
    String description = "";
    for (Element meta : page.select("meta")) {
        String metaName = meta.attr("name");
        if ("keywords".equalsIgnoreCase(metaName)) {
            keywords = meta.attr("content");
        } else if ("Description".equalsIgnoreCase(metaName)) {
            description = meta.attr("content");
        }
    }

    Element body = page.body();

    // Strip <font> elements from the title link before reading its text.
    Elements titleLinks = body.select("h1 span.link_title a");
    titleLinks.select("font").remove();
    String title = titleLinks.text();

    Elements article = body.select(".article_content");

    // LinkedHashSet: de-duplicates tags while keeping their order on the page.
    Set<String> blogTags = new LinkedHashSet<String>();
    for (Element tagLink : body.select(".tag2box a")) {
        blogTags.add(tagLink.text());
    }

    BlogResponse blog = new BlogResponse();
    blog.setTitle(title);
    blog.setContent(getContent(article, req));
    blog.setKeywords(keywords);
    blog.setDescription(description);
    blog.setTags(blogTags);
    blog.setAuthor("csdn");
    blog.setLink(req.getUrl());
    return blog;
}
/**
* 处理文章内容
*
* @param content
* @param req
*/
private String getContent(Elements content, Request req) {
Elements pres = content.select("pre[name='code']");
Map<String, String> map = new HashMap<String, String>();
int index = 1;
for (Element pre : pres) {
//pres
String lang = pre.attr("class");
if (lang != null) {
try {
String h = pre.html().replace(" ", " ").replace("<", "<").replace(">", ">")
.replace("&", "&").replace(""", "\"");
String langHtml = "\n\n[" + lang + "]\n" + h + "\n[/" + lang + "]";
String random = "<!-- #" + lang + "*" + index + "*" + lang + "# -->";
map.put(random, langHtml);
pre.before(random);
index++;
pre.remove();
}
catch (Exception e) {
log.warn("获取代码语言出错", e);
continue;
}
}
}
//图片处理
Elements imgs = content.select("img");
for (Element img : imgs) {
String src = img.attr("src");
if (src != null && !src.toLowerCase().startsWith("javascript:")) {
String imgSrc = "";
if (src.startsWith("http")) {
imgSrc = src;
} else {
//以下方法对相对路径进行转换
try {
URL hostUrl = new URL(req.getUrl());
URL imgUrl = new URL(hostUrl, src);
imgSrc = imgUrl.toString();
}
catch (MalformedURLException e) {
log.warn("图片url转换异常", e);
continue;
}
}
Request imgReq = new Request(imgSrc);
try {
String newSrc = ImageUtil.downLoad(imgReq);
img.attr("src", newSrc);
if ("a".equalsIgnoreCase(img.parent().tagName())) {
Element a = img.parent();
if (src.equals(a.attr("href")) || imgSrc.equals(a.attr("href"))) {
a.attr("href", newSrc);
}
}
}
catch (Exception e) {
log.warn("下载图片错误", e);
}
}
}
String html = content.html();
for (String key : map.keySet()) {
html = html.replace(key, map.get(key));
}
return html;
}
由于 www.ifunit.com 采用的是 WordPress,发布文章时直接使用 XML-RPC 接口就可以了。