htmlunit:
package com.example.demo2;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Web-scraping helpers: page loading through a shared HtmlUnit {@link WebClient}
 * and a small regex extraction utility.
 *
 * <p>NOTE(review): one static {@code WebClient} is shared by every caller and its proxy
 * setting is changed on each {@link #openUrl} call. Concurrent calls would race on that
 * state — confirm callers are single-threaded or serialize them externally.
 *
 * <p>Created 2019/12/27.
 *
 * @author kaiba
 */
@Component
public class PaChongUtils {
    private static final Logger LOGGER = Logger.getLogger(PaChongUtils.class.getName());

    /** Connection/read timeout applied to every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10 * 1000;

    /** Upper bound on how long to let background JavaScript run after a page loads. */
    private static final long BACKGROUND_JS_WAIT_MILLIS = 500;

    private static final WebClient webClient = createWebClient();

    /**
     * Builds the shared client. The options never vary between requests, so they are
     * applied once here instead of on every {@link #openUrl} call.
     */
    private static WebClient createWebClient() {
        WebClient client = new WebClient(BrowserVersion.BEST_SUPPORTED);
        client.getOptions().setJavaScriptEnabled(true);                 // run page scripts
        client.getOptions().setCssEnabled(false);                       // CSS is not needed for scraping
        client.getOptions().setThrowExceptionOnScriptError(false);      // tolerate broken page scripts
        client.getOptions().setThrowExceptionOnFailingStatusCode(false); // return 4xx/5xx pages instead of throwing
        client.getOptions().setTimeout(TIMEOUT_MILLIS);
        client.getOptions().setUseInsecureSSL(true);                    // accept any certificate (no TLS validation)
        // Makes AJAX calls triggered by the page run synchronously so their results are in the DOM.
        client.setAjaxController(new NicelyResynchronizingAjaxController());
        return client;
    }

    /**
     * Loads {@code url} and gives its background JavaScript a short time to finish.
     *
     * @param url         address of the page to load
     * @param proxyConfig proxy to route this request through, or {@code null} for a direct
     *                    connection; a proxy from an earlier call is cleared when this is
     *                    {@code null}
     * @return the loaded page, or {@code null} if the request failed with an
     *         {@link IOException} (connect/read timeout, refused connection, malformed URL, ...)
     */
    public static HtmlPage openUrl(String url, ProxyConfig proxyConfig) {
        // The client is shared, so the proxy must be set explicitly on every call;
        // otherwise a proxy from a previous request would silently stick.
        webClient.getOptions().setProxyConfig(proxyConfig != null ? proxyConfig : new ProxyConfig());
        try {
            HtmlPage htmlPage = webClient.getPage(url);
            // Must run after the page is loaded: waiting beforehand (as the code used to)
            // cannot cover scripts started by this page.
            webClient.waitForBackgroundJavaScript(BACKGROUND_JS_WAIT_MILLIS);
            return htmlPage;
        } catch (IOException e) {
            // ConnectTimeoutException, HttpHostConnectException and SocketTimeoutException
            // are all IOExceptions and were handled identically, so one handler suffices.
            LOGGER.log(Level.WARNING, "Failed to load page: " + url, e);
            return null;
        }
    }

    /**
     * Collects every substring of {@code str} matched by {@code regex}, in order of appearance.
     *
     * @param str   text to search
     * @param regex regular expression, compiled with default flags
     * @return the full text of each match (group 0); empty when nothing matches
     */
    public static List<String> getMatherSubstrs(String str, String regex) {
        List<String> matches = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(str);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}
依赖:
<!-- Maven dependencies for the HtmlUnit example above; place inside <dependencies> in pom.xml. -->
<!-- Apache HttpClient: presumably the source of the org.apache.http.conn exception classes imported above; verify. -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<!-- HtmlUnit: presumably provides the com.gargoylesoftware.htmlunit classes (WebClient, HtmlPage, ProxyConfig); verify. -->
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.29</version>
</dependency>
jsoup:
// Example HTTP proxy endpoint. NOTE: it is built but NOT applied — the request below
// passes null to proxy(), so it connects directly. Pass httpProxy there to use the proxy.
InetSocketAddress addr = new InetSocketAddress("223.199.17.183", 9999);
Proxy httpProxy = new Proxy(Proxy.Type.HTTP, addr);

// Fetch the article with a 5-second timeout.
Document document = Jsoup.connect("https://blog.csdn.net/qq_36441163/article/details/105995743")
        .timeout(5 * 1000)
        .proxy(null)
        .get();

// Wrap the parsed page for XPath queries and locate the article body container.
JXDocument jxd = new JXDocument(document.getElementsByTag("html"));
Object contentNode = jxd.selOne("//*[@id=\"content_views\"]");

// Guard the result instead of casting blindly: if the layout changed or the request
// was blocked, a blind cast would fail with NullPointerException or ClassCastException.
if (contentNode instanceof Element) {
    Element div = (Element) contentNode;
    // Print the text of every paragraph inside the article body.
    List<Element> ps = div.getElementsByTag("p");
    for (Element p : ps) {
        System.out.println(p.text());
    }
} else {
    System.err.println("content_views element not found in page");
}
依赖:
<!-- Maven dependencies for the jsoup example above; place inside <dependencies> in pom.xml. -->
<!-- jsoup: HTML fetching and parsing (Jsoup.connect, Document, Element). -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!-- JsoupXpath: presumably the source of the JXDocument XPath wrapper used in the example; verify. -->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.2</version>
</dependency>