请尊重原创,转载请注明出处:http://my.oschina.net/u/1789904/blog/386576
核心:htmlparser框架
HtmlParser爬取搜狗百科名人数据:
/**
* 从百科搜索中获取百科地址
* @param url
* @param charset
* @param timeOut
* @return
* @throws IOException
*/
private Map parserBaike(String url, String charset, int timeOut) throws IOException {
WebHttpClient util=new WebHttpClient();
String content=util.getWebContentByGet(url,charset,timeOut);
if(content == null){
return null;
}
Map map = new HashMap<>();
Map subMap = new HashMap<>();
try {
//开始解析
Node node = null;
/********************* 解析名字 **********************/
// 过滤出class为term的元素
Parser parser = Parser.createParser(content, charset);
AndFilter filter = new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("id","title"));
NodeList nodeList = parser.parse(filter);
for (int i = 0; i
node = nodeList.elementAt(i);
map.put("name", node.toPlainTextString().trim());
}
/********************* 解析简介 **********************/
// 过滤出class为start-time的元素
Parser parser2 = Parser.createParser(content, charset);
AndFilter filter2 = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class","abstract"));
NodeList nodeList2 = parser2.parse(filter2);
for (int i = 0; i
node = nodeList2.elementAt(i);
String name = node.toPlainTextString().trim();
System.out.println("name:" + name);
map.put("intro", name);
}
// 过滤出id为J_SingleEndTimeLabel的元素
Parser parser3 = Parser.createParser(content, charset);
AndFilter filter3 = new AndFilter(new TagNameFilter("img"),new HasAttributeFilter("class",""));
NodeList nodeList3 = parser3.parse(filter3);
for (int i = 0; i
node = nodeList3.elementAt(i);
String imgUrl = findHttp(node.toHtml());
System.out.println("imgUrl:" + imgUrl);
map.put("logo", imgUrl);
}
/********************* 解析表格数据 **********************/
// 过滤出class为box post的
Parser parser4 = Parser.createParser(content, charset);
//AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_tbl"));
AndFilter andFilter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","abstract_list"));
NodeList tableList = parser4.extractAllNodesThatMatch(andFilter);
System.out.println("tableList.size:" + tableList.size());
//tableList.size() 有两个tableList
for (int i=0; i
TableTag table = (TableTag) tableList.elementAt(i);
//取得表中的行集
TableRow[] rows = table.getRows();
//遍历每行
for (int r=0; r
TableRow tr = rows[r];
//行中的列和标题
TableColumn[] td = tr.getColumns();
TableHeader[] header =tr.getHeaders();
System.out.println("td.length:" + td.length);
for (int c=0; c
String head = header[c].toPlainTextString();
String col = td[c].toPlainTextString().trim();
if (head.equals("出生地")) {
System.out.println("======出生地:" + col);
map.put("home", col);
}
subMap.put(head, col);
System.out.println(head + ":" + col);
}
}
}
} catch (ParserException e) {
e.printStackTrace();
}
map.put("list",subMap.toString());
return map;
}
WebHttpClient.java
package org.jun.utils;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
/**
 * Minimal HTTP helper built on {@link HttpURLConnection} that downloads a response
 * body as text using GET or a form-encoded POST.
 *
 * <p>Every method returns {@code null} when the URL is null/empty, when the server
 * answers with a status other than 200, or when reading the status line fails.
 *
 * @author xiejunbo
 */
public class WebHttpClient {

    /** Charset used by the two-argument/one-argument convenience overloads. */
    private static final String DEFAULT_CHARSET = "iso-8859-1";

    /** Timeout in milliseconds used by the convenience overloads. */
    private static final int DEFAULT_TIMEOUT_MS = 5000;

    public WebHttpClient() {
    }

    /**
     * Issues a GET request and returns the response body.
     *
     * @param urlString target address; {@code http://} is prepended when no scheme is given
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the body with every line terminated by {@code "\r\n"}, or {@code null}
     *         when the URL is null/empty or the response status is not 200
     * @throws IOException if the URL is malformed or reading the body fails
     */
    public String getWebContentByGet(String urlString, final String charset,
            int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        URL url = new URL(normalizeUrl(urlString));
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            // Browser-like User-Agent so that the request is less likely to be blocked.
            conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            // Ask for HTML only.
            conn.setRequestProperty("Accept", "text/html");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block the caller forever.
            conn.setReadTimeout(timeout);
            if (!isOk(conn)) {
                return null;
            }
            return readBody(conn, charset);
        } finally {
            // Always release the connection, including on early return and on exceptions.
            conn.disconnect();
        }
    }

    /**
     * GET with the default charset (iso-8859-1) and a 5000 ms timeout.
     *
     * @param urlString target address
     * @return see {@link #getWebContentByGet(String, String, int)}
     * @throws IOException see {@link #getWebContentByGet(String, String, int)}
     */
    public String getWebContentByGet(String urlString) throws IOException {
        return getWebContentByGet(urlString, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MS);
    }

    /**
     * Issues an {@code application/x-www-form-urlencoded} POST and returns the response body.
     *
     * @param urlString target address; {@code http://} is prepended when no scheme is given
     * @param data      form body as {@code name=value} pairs joined by {@code &}; names and
     *                  values are URL-encoded individually, the separators are kept
     * @param charset   charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the body with every line terminated by {@code "\r\n"}, or {@code null}
     *         when the URL is null/empty or the response status is not 200
     * @throws IOException if the URL is malformed, or sending the request or reading
     *                     the body fails
     */
    public String getWebContentByPost(String urlString, String data, final String charset,
            int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        URL url = new URL(normalizeUrl(urlString));
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // The parameters travel in the request body, so output must be enabled.
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestMethod("POST");
            // POST responses must not come from a cache.
            connection.setUseCaches(false);
            connection.setInstanceFollowRedirects(true);
            connection.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
            // Browser-like User-Agent so that the request is less likely to be blocked.
            connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows vista)");
            // Note: this method asks for XML, unlike the GET variant which asks for HTML.
            connection.setRequestProperty("Accept", "text/xml");
            connection.setConnectTimeout(timeout);
            connection.setReadTimeout(timeout);
            connection.connect();
            try (DataOutputStream out = new DataOutputStream(connection.getOutputStream())) {
                // The encoded form body is pure ASCII, so writeBytes loses nothing.
                out.writeBytes(encodeFormBody(data));
                out.flush();
            }
            // The status may only be read after the request body has been sent.
            if (!isOk(connection)) {
                return null;
            }
            return readBody(connection, charset);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * POST with the default charset (iso-8859-1) and a 5000 ms timeout.
     *
     * @param urlString target address
     * @param data      form body, see {@link #getWebContentByPost(String, String, String, int)}
     * @return see {@link #getWebContentByPost(String, String, String, int)}
     * @throws IOException see {@link #getWebContentByPost(String, String, String, int)}
     */
    public String getWebContentByPost(String urlString, String data) throws IOException {
        return getWebContentByPost(urlString, data, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MS);
    }

    /** Prepends {@code http://} when the address carries no http/https scheme. */
    private static String normalizeUrl(String urlString) {
        boolean hasScheme = urlString.startsWith("http://") || urlString.startsWith("https://");
        return hasScheme ? urlString : "http://" + urlString;
    }

    /**
     * Reports whether the response status is 200. A failure while obtaining the status
     * is printed and treated as "not OK", so that callers return {@code null}.
     */
    private static boolean isOk(HttpURLConnection conn) {
        try {
            return conn.getResponseCode() == HttpURLConnection.HTTP_OK;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Reads the whole response body, terminating every line with {@code "\r\n"}. */
    private static String readBody(HttpURLConnection conn, String charset) throws IOException {
        try (InputStream input = conn.getInputStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
            return sb.toString();
        }
    }

    /**
     * URL-encodes the name and the value of every {@code name=value} pair while keeping
     * the {@code =} and {@code &} separators intact. Encoding the whole string at once
     * would turn the separators into {@code %3D}/{@code %26} and the server would no
     * longer see individual form fields.
     */
    private static String encodeFormBody(String data) throws IOException {
        if (data == null || data.length() == 0) {
            return "";
        }
        StringBuilder encoded = new StringBuilder();
        for (String pair : data.split("&")) {
            if (encoded.length() > 0) {
                encoded.append('&');
            }
            int eq = pair.indexOf('=');
            if (eq < 0) {
                encoded.append(URLEncoder.encode(pair, "utf-8"));
            } else {
                encoded.append(URLEncoder.encode(pair.substring(0, eq), "utf-8"))
                        .append('=')
                        .append(URLEncoder.encode(pair.substring(eq + 1), "utf-8"));
            }
        }
        return encoded.toString();
    }

    /**
     * Manual smoke test against a local login endpoint.
     *
     * <p>NOTE(review): the request below embeds what looks like a real login name and
     * password; consider replacing them with placeholders before publishing.
     */
    public static void main(String[] args) throws IOException {
        WebHttpClient client = new WebHttpClient();
        String s = client.getWebContentByPost("http://localhost:8080/Lottery/login.portal","action=login&loginname=13761083826&password=111111");
        if (s == null) {
            // Non-200 response or connection failure; nothing to decode.
            System.out.println(s);
            return;
        }
        // The body was decoded as iso-8859-1 (lossless for bytes); re-decode it as UTF-8.
        s = new String(s.getBytes("iso-8859-1"), "UTF-8");
        System.out.println(s);
    }
}