抓取页面:https://reports.ingenuity.com/rs/report/function?id=ING%3A48osn
主要代码
package com.ninemax.ak.html.v3;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.math.BigInteger;
import java.security.GeneralSecurityException;
import java.security.MessageDigest;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import com.ninemax.ak.base.SpringContextUtil;
import com.ninemax.ak.dao.CommonDao;
import com.ninemax.ak.html.HtmlParser;
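/**
* HTTP request utility class: fetches report pages over HTTPS, records each
* page in the database through CommonDao and saves the HTML to local disk.
*/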
public class HtmlHttpClient{
public static Logger log =Logger.getLogger(HtmlHttpClient.class);
public CommonDao commonDao = SpringContextUtil.getBean("commonDao");
HtmlParser parser = new HtmlParser();
private static PoolingHttpClientConnectionManager connMgr;
private static RequestConfig requestConfig;
private static final int MAX_TIMEOUT = 700000;
static {
// 设置连接池
connMgr = new PoolingHttpClientConnectionManager();
connMgr.setMaxTotal(100);
connMgr.setDefaultMaxPerRoute(connMgr.getMaxTotal());
RequestConfig.Builder configBuilder = RequestConfig.custom();
// 设置连接超时
configBuilder.setConnectTimeout(MAX_TIMEOUT);
// 设置读取超时
configBuilder.setSocketTimeout(MAX_TIMEOUT);
// 设置从连接池获取连接实例的超时
configBuilder.setConnectionRequestTimeout(MAX_TIMEOUT);
// 在提交请求之前 测试连接是否可用
configBuilder.setStaleConnectionCheckEnabled(true);
requestConfig = configBuilder.build();
}
/**
* @param args
*/
public static void main(String[] args) {
Long start = System.currentTimeMillis();
HtmlHttpClient util = new HtmlHttpClient();
// 主页面链接
String url = "https://reports.ingenuity.com/rs/report/function?id=ING%3A48osn";
// 获取HTML
String html = doPostSSL(url);
Map<String,Object> map = new HashMap<String,Object>();
map.put("urlname", "DiseaseOrFunction");
map.put("originalurl", "/rs/report/function?id=ING%3A48osn");
map.put("wholeurl", url);
map.put("mdurl", md5Encode(url));
map.put("rank", 1);
List<Map<String,Object>> paramList = new ArrayList<Map<String,Object>>();
paramList.add(map);
try {
// 保存主页面到数据库
util.saveHtml(paramList);
// 修改JS,CSS路径
html = modifyHtml(html);
// 处理各个URL链接
util.handleHtml(html);
System.out.println("总用时:" + (System.currentTimeMillis()-start));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* 发送 SSL POST 请求(HTTPS),K-V形式
* @param apiUrl API接口URL
* @param params 参数map
* @return
*/
public static String doPostSSL(String apiUrl) {
CloseableHttpClient httpClient = HttpClients.custom()
.setSSLSocketFactory(createSSLConnSocketFactory())
.setConnectionManager(connMgr)
.setDefaultRequestConfig(requestConfig).build();
HttpPost httpPost = new HttpPost(apiUrl);
CloseableHttpResponse response = null;
String httpStr = null;
try {
// 设置连接池
httpPost.setConfig(requestConfig);
// 设置头信息
httpPost.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpPost.setHeader("Accept-Encoding", "gzip, deflate");
httpPost.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
httpPost.setHeader("Cookie", "JSESSIONID=iprsapp3.ingenuity.com_8124~55A91D52EB5064E44BB9AC1F8AF5B274.iprsapp3; APP_SESSION_ID=MTAzNzExNDc3cEN2SEpUWHBrcW1FbGlicG9wTkRhWm5NYlhPY1FkaG5pM1U6YmFmNmY0YWM2ODM5NjkzYzRjNmI0MDA2YWYxMjhhNTA; REPORTS_VISITED=https%3A%2F%2Freports.ingenuity.com; IPA_VISITED=https%3A%2F%2Fanalysis.ingenuity.com; CASTGC=TGT-15030-uTfbo4QLEheIHfT4e6rqVODZWNB9uQe6coztlgCbTT9Ea3bDPq-cas");
httpPost.setHeader("Host", "reports.ingenuity.com");
httpPost.setHeader("Referer", "https://reports.ingenuity.com/rs/report/function?id=ING%3A48osn");
httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0");
// 执行方法
response = httpClient.execute(httpPost);
// 获取响应码
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode != HttpStatus.SC_OK) {
System.out.println("Error StatusCode:" + statusCode);
}
// 获取响应结果
HttpEntity entity = response.getEntity();
if (entity != null) {
// 获取HTML
httpStr = EntityUtils.toString(entity, "utf-8");
}else{
return null;
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (response != null) {
try {
EntityUtils.consume(response.getEntity());
} catch (IOException e) {
e.printStackTrace();
}
}
}
return httpStr;
}
/**
* 创建SSL安全连接
*
* @return
*/
@SuppressWarnings("all")
private static SSLConnectionSocketFactory createSSLConnSocketFactory() {
SSLConnectionSocketFactory sslsf = null;
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() {
public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
return true;
}
}).build();
sslsf = new SSLConnectionSocketFactory(sslContext, new X509HostnameVerifier() {
public boolean verify(String arg0, SSLSession arg1) {
return true;
}
public void verify(String host, SSLSocket ssl) throws IOException {
}
public void verify(String host, X509Certificate cert) throws SSLException {
}
public void verify(String host, String[] cns, String[] subjectAlts) throws SSLException {
}
});
} catch (GeneralSecurityException e) {
e.printStackTrace();
}
return sslsf;
}
public void saveHtml(List<Map<String,Object>> paramList){
try {
commonDao.saveList(null, "ingenuity", paramList, stringList("urlname,originalurl,wholeurl,mdurl,rank", ","));
} catch (Exception e) {
e.printStackTrace();
}
}
public static String modifyHtml (String html){
// 修改JS
html = html.replace("src=\"/rs/javascripts/", "src=\"./rs/");
// 修改CSS
html = html.replace("href=\"/rs/stylesheets/","href=\"./rs/");
return html ;
}
public void handleHtml(String html){
String div_html = parser.get_TagNode_Html(html, "div", "id", "moleculeParentId");
// 前缀
String prefix = "https://reports.ingenuity.com";
// 获取网页所有链接
List<String> span_html_List = parser.get_TagNode_HtmlList(div_html, "span");
// 遍历A标签
for(String li_html : span_html_List){
// 数据集合
List<Map<String, Object>> paramList = new ArrayList<Map<String,Object>>();
Map<String,Object> map = new HashMap<String,Object>();
String li_name = parser.get_LinkTag_text(li_html, "a");
String li_href = parser.get_TagNode_attr(li_html, "a", "href");
String htmlName = prefix + li_href;
// 查询数据库
try {
int count = commonDao.queryString("SELECT * FROM ingenuity WHERE mdurl = '" + md5Encode(htmlName)+"'" );
// 若数据库不存在,则存入数据库
if(count == 0){
System.out.println("页面不存在......");
map.put("urlname", li_name);
map.put("originalurl", li_href);
map.put("wholeurl", htmlName);
map.put("mdurl", md5Encode(htmlName));
map.put("rank", "2");
paramList.add(map);
// 获取详情页
String detail_html = doPostSSL(htmlName);
// 修改JS、CSS路径
detail_html = modifyHtml(detail_html);
// 将页面保存在本地
writeHTML2txt(detail_html, md5Encode(htmlName));
// 保存数据
saveHtml(paramList);
// 修改主页面链接
html = html.replace(li_href, md5Encode(htmlName) + ".html");
// 延时操作(5-10秒)
Thread.sleep(((int) (5 + Math.random() * 6))*1000);
}else{
System.out.println("页面已存在......");
}
} catch (Exception e) {
e.printStackTrace();
}
}
try {
// 将主页面保存在本地
writeHTML2txt(html, "finallyindex");
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 保存页面
* @param html
* @param htmlName
* @throws IOException
*/
public static void writeHTML2txt(String html, String htmlName) throws IOException {
String savePath = "G:/ingenuity/" + htmlName + ".html";
File f = new File(savePath);
FileWriter fw = new FileWriter(f);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(html);
bw.close();
}
public static String md5Encode(String str){
byte [] result = null;
if(str.isEmpty()){
return null;
}
try {
result = MessageDigest.getInstance("md5").digest(str.getBytes("utf-8"));
} catch (Exception e) {
e.printStackTrace();
}
return new BigInteger(1, result).toString(16);
}
public static List<String> stringList(String in, String split) {
String[] ss = in.split(split);
List<String> tags = new ArrayList<String>();
for (String s : ss) {
if (s.trim().length() > 0) {
tags.add(s.trim());
}
}
return tags;
}
}
HTML解析类
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class HtmlParser {
public static Logger log = Logger.getLogger(HtmlParser.class);
/**
* 获取HTML节点
* @param html
* @param TagNodeName 节点名称
* @return HTML文本
*/
public String get_TagNode_Html(String html,String TagNodeName){
String subhtml = "";
try {
Parser parser = new Parser(html);
NodeFilter tagNode = new TagNameFilter(TagNodeName);
NodeList nodes = parser.parse(tagNode);
if (nodes != null && nodes.size() > 0) {
TagNode textnode = (TagNode) nodes.elementAt(0);
subhtml = textnode.toHtml();
}
} catch (ParserException e) {
e.printStackTrace();
}
return subhtml;
}
/**
* 获取HTML节点
* @param html
* @param TagNodeName 节点
* @param setAttrName 属性名
* @param setAttrValue 属性值
* @return HTML文本
*/
public String get_TagNode_Html(String html, String TagNodeName,String setAttrName, String setAttrValue) {
String second_details_html = "";
try {
Parser parser = new Parser(html);
NodeFilter tagNode = new TagNameFilter(TagNodeName);
NodeFilter attrNode_name = new HasAttributeFilter(setAttrName,setAttrValue);
NodeFilter andNode = new AndFilter(tagNode, attrNode_name);
NodeList nodes = parser.extractAllNodesThatMatch(andNode);
if (nodes != null && nodes.size() > 0) {
TagNode textnode = (TagNode) nodes.elementAt(0);
second_details_html = textnode.toHtml();
}
} catch (ParserException e) {
e.printStackTrace();
}
return second_details_html;
}
/**
* 获取HTML节点数组
* @param html
* @param TagNodeName 节点名称
* @return 节点数组
*/
public List<String> get_TagNode_HtmlList(String html,String TagNodeName) {
List<String> result = new ArrayList<String>();
NodeList nodes=null;
try {
Parser parser = new Parser(html);
NodeFilter tagNode = new TagNameFilter(TagNodeName);
nodes = parser.parse(tagNode);
} catch (ParserException e) {
e.printStackTrace();
}
if (nodes != null && nodes.size() > 0) {
for (int i = 0; i < nodes.size(); i++) {
TagNode textnode = (TagNode) nodes.elementAt(i);
String s = textnode.toHtml();
if (!HtmlUtil.isEmptyTrim(s)) {
result.add(s);
}
}
}
return result;
}
/**
* 获取指定属性HTML节点
* @param html
* @param TagNodeName 节点名称
* @param setAttrName 属性名称
* @param setAttrValue 属性值
* @return HTML文本集合
*/
public List<String> get_TagNode_HtmlList(String html,String TagNodeName ,String setAttrName ,String setAttrValue) {
List<String> result = new ArrayList<String>();
NodeList nodes = null;
try {
Parser parser = new Parser(html);
NodeFilter tagNode = new TagNameFilter(TagNodeName);
NodeFilter attrNode_name = new HasAttributeFilter(setAttrName,setAttrValue);
NodeFilter andNode = new AndFilter(tagNode, attrNode_name);
nodes = parser.extractAllNodesThatMatch(andNode);
} catch (ParserException e) {
log.error("html:" + html, e);
}
if (nodes != null && nodes.size() > 0) {
for (int i = 0; i < nodes.size(); i++) {
TagNode textnode = (TagNode) nodes.elementAt(i);
String s = textnode.toHtml();
if (!HtmlUtil.isEmptyTrim(s)) {
result.add(s);
}
}
}
return result;
}
/**
* 获取HTML节点的属性
* @param html
* @param TagNodeName 节点名称
* @param getAttrName 属性名称
* @return
*/
public String get_TagNode_attr(String html, String TagNodeName,String getAttrName) {
String attr = "";
NodeList nodes = null;
try {
Parser parser = new Parser(html);
NodeFilter tagNode = new TagNameFilter(TagNodeName);
nodes = parser.parse(tagNode);
} catch (ParserException e) {
e.printStackTrace();
}
if (nodes != null && nodes.size() > 0) {
TagNode textnode = (TagNode) nodes.elementAt(0);
attr = textnode.getAttribute(getAttrName);
}
return attr;
}
/**
* 获取A标签链接名称
* @param html
* @param TagNodeName A
* @return
*/
public String get_LinkTag_text(String html,String TagNodeName) {
String text="";
NodeList nodes=null;
try {
Parser parser = new Parser(html);
NodeFilter tagNode=new TagNameFilter(TagNodeName);
nodes = parser.parse(tagNode);
} catch (ParserException e) {
e.printStackTrace();
}
if(nodes!=null&&nodes.size()>0) {
LinkTag textnode = (LinkTag) nodes.elementAt(0);
text = textnode.getLinkText().trim();
}
return text;
}
}
Remark:QQ交流群:260052172