import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Downloads the post bodies of one Baidu Tieba thread, page by page, and
 * stores each page as a local {@code .html} file.
 *
 * <p>Pages are fetched and parsed with the HTML Parser library
 * ({@code org.htmlparser}); only the helper methods in this class decide
 * which nodes are kept and how the output files are named.
 */
public class BaiduParse {

    /** Character set used to decode fetched pages and to encode saved files. */
    private static final String PAGE_ENCODING = "gb2312";

    /** Separator printed on both sides of each page URL on standard output. */
    private static final String BANNER =
            "================================================";

    /**
     * Fetches every page of the hard-coded thread and writes each one to
     * {@code <threadId>pn=<page>.html} in the working directory.
     *
     * @param args ignored
     * @throws Exception if a page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        String str = "http://tieba.baidu.com/p/1303669256";
        String endPage = getEndPage(str);
        // Derive a numeric upper bound instead of comparing URL strings: a
        // missing pager (null) or a link that is not textually identical to
        // the generated URL must not crash or loop forever.
        int lastPage = lastPageNumber(endPage);
        for (int page = 1; page <= lastPage; page++) {
            String url = str + "?pn=" + page;
            System.out.println(BANNER + url + BANNER);
            String content = getContent(url);
            // Failures are reported by writeStringToFile itself.
            writeStringToFile(fileName(url) + ".html", content, PAGE_ENCODING);
        }
    }

    /**
     * Extracts the page number from the {@code pn} query parameter of the
     * last-page link.
     *
     * @param endPageUrl link returned by {@link #getEndPage(String)}; may be
     *            {@code null}
     * @return the parsed page number, or {@code 1} when the link is
     *         {@code null}, has no {@code pn} parameter, or the value is not a
     *         positive integer
     */
    private static int lastPageNumber(String endPageUrl) {
        if (endPageUrl == null) {
            return 1;
        }
        int marker = Math.max(endPageUrl.lastIndexOf("?pn="),
                endPageUrl.lastIndexOf("&pn="));
        if (marker < 0) {
            return 1;
        }
        int start = marker + 4;
        int end = start;
        while (end < endPageUrl.length()
                && Character.isDigit(endPageUrl.charAt(end))) {
            end++;
        }
        if (end == start) {
            return 1;
        }
        try {
            return Math.max(1,
                    Integer.parseInt(endPageUrl.substring(start, end)));
        } catch (NumberFormatException e) {
            // Digit run too long for an int: treat as "unknown".
            return 1;
        }
    }

    /**
     * Builds the local file name (without extension) for a page URL.
     *
     * <p>The text after the last {@code /} is taken and split on {@code ?};
     * the first and the last part are concatenated, e.g.
     * {@code http://host/p/123?pn=4} becomes {@code 123pn=4}. When there is
     * no query part the segment is returned once, and an input that consists
     * only of separators yields an empty string.
     *
     * @param name URL or path to derive the file name from
     * @return the derived file name, never {@code null}
     */
    public static String fileName(String name) {
        String[] pathParts = name.split("/");
        if (pathParts.length == 0) {
            // Input was only slashes: split() drops all empty parts.
            return "";
        }
        String lastSegment = pathParts[pathParts.length - 1];
        String[] queryParts = lastSegment.split("\\?");
        if (queryParts.length == 0) {
            return "";
        }
        if (queryParts.length == 1) {
            // No query string: do not append the same part twice.
            return queryParts[0];
        }
        return queryParts[0] + queryParts[queryParts.length - 1];
    }

    /**
     * Finds the URL of the last link inside the thread's pager element.
     *
     * <p>The pager is the last {@code li} element whose {@code class}
     * attribute equals {@code "l_pager pager_theme_2"}; its direct children
     * are scanned from the end for the first {@link LinkTag}.
     *
     * @param url URL of the thread page to inspect
     * @return the link extracted from the pager's last link tag, or
     *         {@code null} if no pager or no link is found
     * @throws ParserException if the page cannot be fetched or parsed
     */
    public static String getEndPage(String url) throws ParserException {
        Parser parser = new Parser();
        parser.setURL(url);
        parser.setEncoding(PAGE_ENCODING);
        // Match only <li class="l_pager pager_theme_2"> nodes.
        NodeFilter pagerFilter = new AndFilter(new TagNameFilter("li"),
                new HasAttributeFilter("class", "l_pager pager_theme_2"));
        NodeList pagers = parser.extractAllNodesThatMatch(pagerFilter);
        String lastLink = null;
        if (pagers != null && pagers.size() > 0) {
            Node pager = pagers.elementAt(pagers.size() - 1);
            // NOTE(review): getChildren() is expected to return null for a
            // node without children - confirm for the library version in use.
            NodeList children = pager.getChildren();
            if (children != null) {
                // Walk backwards so trailing text nodes (e.g. whitespace)
                // after the final link do not cause a bad cast.
                for (int i = children.size() - 1; i >= 0; i--) {
                    Node child = children.elementAt(i);
                    if (child instanceof LinkTag) {
                        lastLink = ((LinkTag) child).extractLink();
                        break;
                    }
                }
            }
        }
        parser.reset();
        return lastLink;
    }

    /**
     * Collects the HTML of all post bodies on one page.
     *
     * <p>Post bodies are the {@code p} elements whose {@code class} attribute
     * equals {@code "d_post_content"}. Of each body's direct children, every
     * node is kept except those whose HTML starts with {@code <a} or
     * {@code <img}; the kept nodes' HTML is concatenated in document order.
     *
     * @param url URL of the page to read
     * @return the concatenated HTML, empty if nothing matched
     * @throws ParserException if the page cannot be fetched or parsed
     */
    public static String getContent(String url) throws ParserException {
        Parser parser = new Parser();
        parser.setURL(url);
        parser.setEncoding(PAGE_ENCODING);
        // Match only <p class="d_post_content"> nodes.
        NodeFilter postFilter = new AndFilter(new TagNameFilter("p"),
                new HasAttributeFilter("class", "d_post_content"));
        NodeList posts = parser.extractAllNodesThatMatch(postFilter);
        // One stateless filter instance serves every post.
        NodeFilter keepFilter = new LinkAndImageExcludingFilter();
        StringBuilder html = new StringBuilder();
        for (int i = 0; i < posts.size(); i++) {
            NodeList children = posts.elementAt(i).getChildren();
            if (children == null) {
                // Empty post body: nothing to append.
                continue;
            }
            NodeList kept = children.extractAllNodesThatMatch(keepFilter);
            for (int j = 0; j < kept.size(); j++) {
                html.append(kept.elementAt(j).toHtml());
            }
        }
        parser.reset();
        return html.toString();
    }

    /**
     * Rejects nodes whose serialized HTML starts with {@code <a} or
     * {@code <img}; accepts everything else. The check is a plain prefix
     * test on the node's HTML text.
     */
    private static final class LinkAndImageExcludingFilter implements NodeFilter {
        private static final long serialVersionUID = 1L;

        public boolean accept(Node node) {
            String markup = node.toHtml();
            return !(markup.startsWith("<a") || markup.startsWith("<img"));
        }
    }

    /**
     * Writes a string to a file, replacing any existing content.
     *
     * <p>Any failure while opening, writing or closing the file is printed
     * via {@code printStackTrace()} and reported through the return value
     * rather than thrown.
     *
     * @param fileName path of the file to write
     * @param content text to write
     * @param enc charset name; when {@code null} or empty the platform
     *            default charset is used
     * @return {@code true} if the content was written and the file closed
     *         successfully, {@code false} otherwise
     * @throws IOException declared for compatibility with existing callers;
     *             I/O failures are reported through the return value
     */
    public static boolean writeStringToFile(String fileName, String content,
            String enc) throws IOException {
        File file = new File(fileName);
        boolean useDefaultCharset = enc == null || enc.length() == 0;
        // No deleteOnExit()/delete() for an existing file: FileOutputStream
        // truncates it, and deleteOnExit() would remove the new output when
        // the JVM terminates.
        try {
            FileOutputStream out = new FileOutputStream(file);
            try {
                OutputStreamWriter writer = useDefaultCharset
                        ? new OutputStreamWriter(out)
                        : new OutputStreamWriter(out, enc);
                writer.write(content);
                // Closing the writer flushes pending output and closes out.
                writer.close();
            } finally {
                // Releases the handle when the writer could not be created
                // or writing failed; harmless if already closed.
                out.close();
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on a boolean result instead
            // of exceptions for every kind of write failure.
            e.printStackTrace();
            return false;
        }
        return true;
    }
}