注意:下面的方法在CSDN博客改版以后已经无法使用,因为现在CSDN博客不再支持MetaWeblog API(即代码中调用的metaWeblog.newPost接口),不知道什么时候可以恢复支持。
1.原文链接
http://hi.baidu.com/cnjsp/blog/item/e175cf1b27bc6af6ae513335.html
2.心得
本方法我测试过,是可以用的。一方面感觉思路挺新颖的,程序员自己写代码解决自己的问题;另一方面可以通过这个实例学习一下Java,所以我贴出我修改后的Java代码。
具体思路可以参见原文。
3.代码
CSDNPost.java
package cn.mingyuan.baidu2csdn.core;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
/**
 * A blog post to be published on CSDN through the MetaWeblog XML-RPC API.
 * <p>
 * Holds the post data (title, content, categories, creation date) and knows
 * how to publish itself with {@link #publish()}. Every publish attempt is
 * appended to the file {@code post.log} in the working directory.
 *
 * @author mingyuanonline@gmail.com
 *
 */
public class CSDNPost {
    /**
     * MetaWeblog endpoint. The path segment after the host is the CSDN user
     * name; replace it with your own.
     */
    private static final String SERVER_URL =
            "http://blog.csdn.net/xw13106209/services/metablogapi.aspx";
    /** First argument of metaWeblog.newPost (blog id); replace with your own user name. */
    private static final String BLOG_ID = "xw13106209";
    /** Second argument of metaWeblog.newPost; replace with your own user name. */
    private static final String USERNAME = "xw13106209";
    /** Third argument of metaWeblog.newPost; replace with your own password. */
    private static final String PASSWORD = "password";
    /** File that receives one line per publish attempt. */
    private static final String LOG_FILE = "post.log";

    /** XML-RPC configuration shared by all posts. */
    private static XmlRpcClientConfigImpl config;
    /** XML-RPC client shared by all posts. */
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            config.setServerURL(new URL(SERVER_URL));
        } catch (MalformedURLException e) {
            System.out.println("请检查url");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /** Creation date of the post. */
    private Date dateCreated;
    /** Body (content) of the post. */
    private String description;
    /** Title of the post. */
    private String title;
    /** Categories of the post. */
    private String[] categories;

    /** Creates an empty post; fill it through the setters before publishing. */
    public CSDNPost() {
    }

    /**
     * Creates a fully populated post.
     *
     * @param title
     *            post title
     * @param description
     *            post body
     * @param categories
     *            post categories
     * @param dateCreated
     *            creation date of the post
     */
    public CSDNPost(String title, String description, String[] categories,
            Date dateCreated) {
        this.dateCreated = dateCreated;
        this.description = description;
        this.title = title;
        this.categories = categories;
    }

    public Date getDateCreated() {
        return dateCreated;
    }

    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String[] getCategories() {
        return categories;
    }

    public void setCategories(String[] categories) {
        this.categories = categories;
    }

    /**
     * Appends one line to the log file.
     * <p>
     * The text is encoded with the platform default charset, exactly as
     * before, so existing tools that read the log keep working. The stream is
     * always closed, even when writing fails.
     *
     * @param log
     *            the line to append (a CRLF is added)
     */
    private void writelog(String log) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(LOG_FILE, true);
            fos.write((log + "\r\n").getBytes());
            fos.flush();
        } catch (IOException e) {
            System.out.println("写入日志错误:" + log);
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    System.out.println("写入日志错误:" + log);
                }
            }
        }
    }

    /** Writes the same message to the log file and to standard output. */
    private void report(String message) {
        writelog(message);
        System.out.println(message);
    }

    /**
     * Publishes this post through {@code metaWeblog.newPost}.
     * <p>
     * A post without a title or body is not sent; it is reported as a failed
     * import instead. The success line (which carries the new post id) is
     * written only when the remote call actually succeeded, so the log no
     * longer contains "id = null" success lines for failed imports.
     */
    public void publish() {
        if (title == null || description == null) {
            report("导入出现错误:title=" + title);
            return;
        }

        Map<String, Object> struct = new HashMap<String, Object>();
        struct.put("title", title);
        struct.put("description", description);
        // Optional members are left out when absent instead of being sent as
        // null. NOTE(review): presumably the XML-RPC client cannot serialize
        // null values without extensions enabled - confirm against the
        // Apache XML-RPC documentation.
        if (dateCreated != null) {
            struct.put("dateCreated", dateCreated);
        }
        if (categories != null) {
            struct.put("categories", categories);
        }

        // Argument order: blog id, user name, password, post struct, publish flag.
        Object[] params = new Object[] { BLOG_ID, USERNAME, PASSWORD, struct,
                true };

        Object result;
        try {
            result = client.execute("metaWeblog.newPost", params);
        } catch (XmlRpcException e) {
            report("导入出现错误:title=" + title + " (" + e.getMessage() + ")");
            return;
        }

        // Do not assume the id comes back as a String; convert whatever the
        // server returned instead of risking a ClassCastException.
        String blogid = (result == null) ? null : result.toString();
        report(title + ">> 导入完毕,生成博文id为>>" + blogid);
    }

    /**
     * Manual smoke test: tries to publish an empty post, which is reported as
     * a failed import because it has no title or body.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        CSDNPost post = new CSDNPost();
        post.publish();
    }
}
BaiduHi.java
package cn.mingyuan.baidu2csdn.core;
import java.util.Date;
/**
 * Plain data holder for one post read from a Baidu Hi blog: title, body,
 * a single category name and the creation date.
 *
 * @author mingyuanonline@gmail.com
 *
 */
public class BaiduHi {

    /** Post title. */
    private String title;

    /** Post body. */
    private String description;

    /** Category name of the post. */
    private String categories;

    /** Date the post was published. */
    private Date dateCreated;

    /** Creates an empty instance; all fields start out as {@code null}. */
    public BaiduHi() {
    }

    /**
     * Creates a fully populated instance.
     *
     * @param title
     *            post title
     * @param description
     *            post body
     * @param categories
     *            category name
     * @param dateCreated
     *            publication date
     */
    public BaiduHi(String title, String description, String categories,
            Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = dateCreated;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the post body */
    public String getDescription() {
        return this.description;
    }

    /** @param description the post body */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the category name */
    public String getCategories() {
        return this.categories;
    }

    /** @param categories the category name */
    public void setCategories(String categories) {
        this.categories = categories;
    }

    /** @return the publication date */
    public Date getDateCreated() {
        return this.dateCreated;
    }

    /** @param dateCreated the publication date */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }

    /**
     * Entry point kept for compatibility; it does nothing.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
    }
}
BaiduHiFetcher.java
package cn.mingyuan.baidu2csdn.core;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads pages of a Baidu Hi blog and extracts post links and post data
 * from the HTML with regular expressions.
 * <p>
 * The expressions are tied to the page markup the original author saw; pages
 * with different markup simply produce no matches.
 *
 * @author mingyuanonline@gmail.com
 *
 */
public class BaiduHiFetcher {
    /** Host prepended to the relative post links found in index pages. */
    private static final String HOST = "http://hi.baidu.com";
    /** Charset used to decode downloaded pages. */
    private static final String PAGE_CHARSET = "gb2312";
    /** Connect/read timeout so one stalled request cannot hang the whole run. */
    private static final int TIMEOUT_MILLIS = 30 * 1000;
    /**
     * Highest index page fetched when the page count cannot be detected
     * (the value that used to be hard-coded).
     */
    private static final int DEFAULT_LAST_PAGE = 2;

    /** Title block of one post inside an index page. */
    private static final Pattern POST_LINK_DIV = Pattern
            .compile("<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>");
    /** "Last page" link of the pager; group 1 is the page number. Works for any blog name. */
    private static final Pattern LAST_PAGE_LINK = Pattern
            .compile("<a[\\s]href=\"/[^/\"]+/blog/index/(\\d+)\">\\[尾页\\]</a>");
    /** Title of a post page; group 1 is the title text. */
    private static final Pattern POST_TITLE = Pattern
            .compile("<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">(.+?)</div><div[\\s]class=\"date\">");
    /** Date block of a post page; group 1 is the date text. */
    private static final Pattern POST_DATE = Pattern
            .compile("<div[\\s]class=\"date\">(.+?)</div>");
    /** Body of a post page; group 1 is the content up to the first closing div. */
    private static final Pattern POST_TEXT = Pattern
            .compile("<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>(.+?)</div>");
    /** Category link of a post page; group 1 is the category name. */
    private static final Pattern POST_CATEGORY = Pattern
            .compile("title=\"查看该分类中所有文章\">类别:(.+?)</a>");
    /**
     * Date text such as "2011年07月01日 星期五 下午 01:05". Groups: year, month,
     * day, AM/PM marker character, hour, minute.
     */
    private static final Pattern DATE_TEXT = Pattern
            .compile("(\\d{4})\\s*年\\s*(\\d{1,2})\\s*月\\s*(\\d{1,2})\\s*日.*?([上下])午\\s*(\\d{1,2}):(\\d{1,2})");

    /**
     * Downloads a page and returns its source as one string.
     * <p>
     * Lines are joined WITHOUT line separators on purpose: the regular
     * expressions in this class use {@code .} without DOTALL and therefore
     * rely on the page being a single line. On an I/O error the text read so
     * far (possibly empty) is returned. The stream is always closed.
     *
     * @param url
     *            address of the page
     * @return page source, never {@code null}
     */
    private String downloadPage(String url) {
        StringBuilder sb = new StringBuilder();
        InputStream in = null;
        BufferedReader reader = null;
        try {
            URLConnection conn = new URL(url).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            in = conn.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (MalformedURLException e) {
            System.out.println("请检查url是否规范");
        } catch (IOException e) {
            System.out.println("读取源码错误:" + url);
        } finally {
            // Closing the reader also closes the wrapped stream; fall back to
            // the raw stream when the reader was never created.
            closeQuietly(reader != null ? reader : in);
        }
        return sb.toString();
    }

    /** Closes a resource, ignoring {@code null} and close failures. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing useful can be done when closing fails
        }
    }

    /**
     * Extracts the post links contained in an index page.
     *
     * @param html
     *            source of an index page
     * @return absolute post URLs in page order; empty when none are found
     */
    private List<String> getPostLinks(String html) {
        List<String> posts = new ArrayList<String>();
        Matcher divs = POST_LINK_DIV.matcher(html);
        while (divs.find()) {
            String div = divs.group();
            int start = div.indexOf("/");
            int end = div.indexOf("\" target");
            // Skip blocks whose link is not of the form href="/..." target=...
            // instead of failing with StringIndexOutOfBoundsException.
            if (start < 0 || end <= start) {
                continue;
            }
            posts.add(HOST + div.substring(start, end));
        }
        return posts;
    }

    /**
     * Reads the page number out of the "last page" pager link.
     * <p>
     * Blogs with few posts may not show that link; in that case 0 is
     * returned.
     *
     * @param html
     *            source of an index page (preferably the first one)
     * @return page number of the last-page link, or 0 when it is not found
     */
    private int getTotalPages(String html) {
        Matcher pager = LAST_PAGE_LINK.matcher(html);
        if (!pager.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(pager.group(1));
        } catch (NumberFormatException e) {
            // absurdly long digit run: treat as "not found"
            return 0;
        }
    }

    /**
     * Collects the addresses of the posts of a blog.
     * <p>
     * Index pages {@code /blog/index/0} up to the page number of the "last
     * page" link are read. When that link cannot be found, pages 0 to 2 are
     * read, which is what this method always did before. The blog URL is not
     * encoded; encode it first if it contains non-ASCII characters.
     *
     * @param blogUrl
     *            blog address, e.g. {@code http://hi.baidu.com/name}
     * @return post addresses on a stack; take them off with {@code pop()} to
     *         get them in reverse listing order. NOTE(review): the original
     *         author states this yields the earliest post first - this relies
     *         on the blog listing newest posts first; confirm.
     */
    public Stack<String> getAllPostLink(String blogUrl) {
        Stack<String> posts = new Stack<String>();

        String firstPageHtml = downloadPage(blogUrl + "/blog/index/0");
        posts.addAll(getPostLinks(firstPageHtml));

        int detected = getTotalPages(firstPageHtml);
        int lastPage = detected > 0 ? detected : DEFAULT_LAST_PAGE;

        for (int i = 1; i <= lastPage; i++) {
            String page = downloadPage(blogUrl + "/blog/index/" + i);
            posts.addAll(getPostLinks(page));
        }
        return posts;
    }

    /**
     * Returns the trimmed first capture group of the first match, or
     * {@code null} when the pattern does not occur in the page.
     */
    private static String firstGroup(Pattern pattern, String html) {
        Matcher m = pattern.matcher(html);
        return m.find() ? m.group(1).trim() : null;
    }

    /**
     * Downloads a post page and extracts title, date, body and category.
     * <p>
     * Every piece that cannot be found in the page is left {@code null} in
     * the result. The body is cut at the first closing {@code div} inside the
     * content block.
     *
     * @param postUrl
     *            address of the post
     * @return the extracted data, never {@code null}
     */
    public BaiduHi getBaiduHi(String postUrl) {
        String html = downloadPage(postUrl);

        Date postDate = null;
        String dateStr = firstGroup(POST_DATE, html);
        if (dateStr != null) {
            postDate = getDate(dateStr);
            if (postDate == null) {
                System.out.println("无法解析日期:" + dateStr);
            }
        }

        BaiduHi hi = new BaiduHi();
        hi.setTitle(firstGroup(POST_TITLE, html));
        hi.setDescription(firstGroup(POST_TEXT, html));
        hi.setCategories(firstGroup(POST_CATEGORY, html));
        hi.setDateCreated(postDate);
        return hi;
    }

    /**
     * Parses a post date such as "2011年07月01日 星期五 下午 01:05".
     * <p>
     * The date is built in one step, with seconds set to zero, in the default
     * time zone. (Setting year, month and day one by one on "now" rolled the
     * month over whenever the current day of month did not exist in the
     * target month.) The 12-hour clock is converted so that 12 AM is hour 0
     * and 12 PM is hour 12.
     *
     * @param str
     *            date text taken from the post page
     * @return the parsed date, or {@code null} when the text does not have
     *         the expected form
     */
    @SuppressWarnings("deprecation")
    private Date getDate(String str) {
        Matcher m = DATE_TEXT.matcher(str);
        if (!m.find()) {
            return null;
        }
        int year = Integer.parseInt(m.group(1));
        int month = Integer.parseInt(m.group(2));
        int day = Integer.parseInt(m.group(3));
        boolean afternoon = "下".equals(m.group(4));
        int hour = Integer.parseInt(m.group(5)) % 12;
        int minute = Integer.parseInt(m.group(6));
        if (afternoon) {
            hour += 12;
        }
        return new Date(year - 1900, month - 1, day, hour, minute);
    }
}
Transfer.java
package cn.mingyuan.baidu2csdn.core;
import java.util.Stack;
/**
 * Moves a blog: reads every post of a Baidu Hi blog and publishes it on CSDN.
 *
 * @author mingyuanonline@gmail.com
 *
 */
public class Transfer {
    /** Blog that is migrated when no address is given on the command line. */
    private static final String DEFAULT_BLOG_URL = "http://hi.baidu.com/xwdreamer";
    /** Pause between two published posts, in milliseconds. */
    private static final long PAUSE_MILLIS = 5 * 1000;

    /**
     * Runs the migration.
     * <p>
     * Posts whose title or body could not be extracted are skipped instead of
     * being published empty. When the pause between posts is interrupted the
     * interrupt flag is restored and the migration stops.
     *
     * @param args
     *            optional: {@code args[0]} is the address of the Baidu Hi
     *            blog; without it the built-in default address is used
     */
    public static void main(String[] args) {
        String postUrl = (args != null && args.length > 0) ? args[0]
                : DEFAULT_BLOG_URL;
        BaiduHiFetcher fetcher = new BaiduHiFetcher();
        Stack<String> urls = fetcher.getAllPostLink(postUrl);
        while (!urls.isEmpty()) {
            String url = urls.pop();
            BaiduHi hi = fetcher.getBaiduHi(url);
            if (hi.getTitle() == null || hi.getDescription() == null) {
                System.out.println("解析博文失败,已跳过:" + url);
                continue;
            }

            // A missing category becomes an empty array, not an array that
            // holds a null element.
            String category = hi.getCategories();
            String[] categories = (category == null) ? new String[0]
                    : new String[] { category };

            CSDNPost post = new CSDNPost();
            post.setTitle(hi.getTitle());
            post.setDescription(hi.getDescription());
            post.setCategories(categories);
            post.setDateCreated(hi.getDateCreated());
            post.publish();

            try {
                Thread.sleep(PAUSE_MILLIS);
            } catch (InterruptedException e) {
                System.out.println("休眠出错");
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}
package cn.mingyuan.baidu2csdn.core;
import java.util.Stack;
/**
 * Moves a blog: reads every post of a Baidu Hi blog and publishes it on CSDN.
 *
 * @author mingyuanonline@gmail.com
 *
 */
public class Transfer {
    /** Blog that is migrated when no address is given on the command line. */
    private static final String DEFAULT_BLOG_URL = "http://hi.baidu.com/xwdreamer";
    /** Pause between two published posts, in milliseconds. */
    private static final long PAUSE_MILLIS = 5 * 1000;

    /**
     * Runs the migration.
     * <p>
     * Posts whose title or body could not be extracted are skipped instead of
     * being published empty. When the pause between posts is interrupted the
     * interrupt flag is restored and the migration stops.
     *
     * @param args
     *            optional: {@code args[0]} is the address of the Baidu Hi
     *            blog; without it the built-in default address is used
     */
    public static void main(String[] args) {
        String postUrl = (args != null && args.length > 0) ? args[0]
                : DEFAULT_BLOG_URL;
        BaiduHiFetcher fetcher = new BaiduHiFetcher();
        Stack<String> urls = fetcher.getAllPostLink(postUrl);
        while (!urls.isEmpty()) {
            String url = urls.pop();
            BaiduHi hi = fetcher.getBaiduHi(url);
            if (hi.getTitle() == null || hi.getDescription() == null) {
                System.out.println("解析博文失败,已跳过:" + url);
                continue;
            }

            // A missing category becomes an empty array, not an array that
            // holds a null element.
            String category = hi.getCategories();
            String[] categories = (category == null) ? new String[0]
                    : new String[] { category };

            CSDNPost post = new CSDNPost();
            post.setTitle(hi.getTitle());
            post.setDescription(hi.getDescription());
            post.setCategories(categories);
            post.setDateCreated(hi.getDateCreated());
            post.publish();

            try {
                Thread.sleep(PAUSE_MILLIS);
            } catch (InterruptedException e) {
                System.out.println("休眠出错");
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}
DeletePostById.java
package cn.mingyuan.baidu2csdn.core;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
/**
 * Deletes CSDN posts by id through the {@code blogger.deletePost} XML-RPC
 * call. The ids are read from the lines of an import log.
 */
public class DeletePostById {
    /**
     * Text that precedes the post id in a log line, as a regular expression.
     * Both spellings are accepted: the one ending in a full-width colon and
     * the one ending in {@code >>}, which is what {@code CSDNPost} writes.
     */
    private static final String ID_MARKER_REGEX = "生成博文id为(?::|>>)";
    /** File whose lines are scanned for post ids. */
    private static final String INPUT_FILE = "content";

    /** XML-RPC configuration. */
    private static XmlRpcClientConfigImpl config;
    /** XML-RPC client. */
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            // The path segment after the host is the CSDN user name; replace
            // it with your own.
            config.setServerURL(new URL(
                    "http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        } catch (MalformedURLException e) {
            System.out.println("请检查url");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /**
     * Deletes one post.
     * <p>
     * All five arguments are now forwarded to the remote call; previously
     * {@code appkey} and {@code publish} were ignored and fixed values were
     * sent. The "done" message is printed only when the call succeeded.
     *
     * @param appkey
     *            application key; any value, the original author notes it is
     *            ignored
     * @param postid
     *            id of the post to delete
     * @param username
     *            user name
     * @param password
     *            password
     * @param publish
     *            whether the blog is republished after the post is deleted
     */
    public static void delete(String appkey, String postid, String username,
            String password, boolean publish) {
        Object[] params = new Object[] { appkey, postid, username, password,
                publish };
        try {
            client.execute("blogger.deletePost", params);
        } catch (XmlRpcException e) {
            System.out.println("删除出错,postid=" + postid);
            return;
        }
        System.out.println(postid + "删除完毕");
    }

    /**
     * Reads the input file line by line and deletes every post whose id is
     * found, pausing ten seconds after each deletion.
     * <p>
     * Lines without the id marker (for example failed-import lines) and lines
     * whose id is empty or the text "null" are skipped. The file is read with
     * the platform default charset and is always closed.
     *
     * @param args
     *            unused
     * @throws InterruptedException
     *             if the pause between deletions is interrupted
     */
    public static void main(String[] args) throws InterruptedException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(INPUT_FILE)));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(ID_MARKER_REGEX);
                if (parts.length < 2) {
                    continue;
                }
                // The id is whatever follows the last marker in the line.
                String postid = parts[parts.length - 1].trim();
                if (postid.length() == 0 || "null".equals(postid)) {
                    continue;
                }
                delete("ignored", postid, "your username", "your password",
                        true);
                Thread.sleep(1000 * 10);
            }
        } catch (FileNotFoundException e1) {
            System.out.println("文件没找到");
        } catch (IOException e) {
            System.out.println("读取文件失败");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done when closing fails
                }
            }
        }
    }
}