目标:获取百度百科基本信息、信息列表、人物图片(同名情况暂不考虑)。
重点:调用开源Jar包Jsoup对HTML解析。网页抓取、简单爬虫。
例子(部分类去掉,运行需改改code):
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonProcessingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Scrapes basic information from Baidu Baike pages via Jsoup.
 * Per the file header: fetches a person's basic-info section, info list,
 * and portrait image; duplicate-name disambiguation is not handled.
 */
public class BaiduBaikeUtils
{
// Logger for connection/scraping failures (see getDocument below).
private static Logger logger = LoggerFactory.getLogger(BaiduBaikeUtils.class);
// Base search URL; the person's name is appended to form the lookup URL.
final static String BAIKE_SEARCH = "http://baike.baidu.com/search/word?word=";
// A single double-quote character — presumably used for trimming quoted
// values elsewhere in this class; usage not visible in this chunk.
final static String DOUBLE_QUOTE = "\"";
// Regex: one or more characters that are neither Unicode letters (\p{L})
// nor decimal digits (\p{Nd}) — i.e. runs of "non-word" characters.
final static String NON_WORD_CHAR = "[^\\p{L}\\p{Nd}]+";
/*
 * Builds the Baidu Baike search URL for the given name.
 *
 * The name is URL-encoded as UTF-8 so that non-ASCII characters — e.g.
 * Chinese person names, the primary input here — and spaces form a valid
 * query string instead of being concatenated raw into the URL.
 */
private static String getBaikeSearchUrl(String name){
    try {
        return BAIKE_SEARCH + URLEncoder.encode(name, StandardCharsets.UTF_8.name());
    } catch (UnsupportedEncodingException e) {
        // UTF-8 support is mandated by the JVM spec, so this is unreachable;
        // fall back to the original (unencoded) behavior just in case.
        return BAIKE_SEARCH + name;
    }
}
/*
 * Fetches and parses the HTML document at the given URL with a 5 s timeout.
 *
 * Returns null when the connection fails or times out; callers must
 * null-check the result (as parseBaseInfoWrap does).
 */
private static Document getDocument(String url){
    Document doc = null;
    try {
        doc = Jsoup.connect(url).timeout(5000).get();
    } catch (IOException e) {
        // Parameterized SLF4J logging; pass the exception itself as the last
        // argument so the full stack trace is preserved instead of only
        // the message text.
        logger.error("error to connect to Baidu baike, url={}", url, e);
    }
    return doc;
}
/*
* 获取具体信息列表
*/
private static WeiboPerson parseBaseInfoWrap(Document doc){
final String BASE_INFO_WRAP = "baseInfoWrapDom";
WeiboPerson weiboPerson = new WeiboPerson();
Map<String, String> biTitleMap = setBiTitleMap();
if (doc == null) return null;
if (null != doc.getElementById(BASE_INFO_WRAP)) {
Elements elements = doc.getElementById(BASE_INFO_WRAP).getElementsByClass("biItemInner");
if (null != elements && elements.size() > 0) {
for (Element element : elements) {
setWeiboPerson(weiboPe