package com.example.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Small collection of static helpers that download a web page and print or parse its content,
 * using plain {@link HttpURLConnection}, jsoup, or HtmlUnit (for pages that need JavaScript).
 *
 * @author lvgb
 */
public class getHtml {

    /**
     * Browser-like User-Agent sent by {@link #captureHtml(String)} and {@link #jsoup666(String)}.
     * Per the original author's note, some sites answer 403 unless such a header is present.
     * (The previous literal ended with a stray, unbalanced ")" which has been removed.)
     */
    private static final String CHROME_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36";

    /** User-Agent sent by {@link #getDocument(String)}. */
    private static final String FIREFOX_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0";

    /**
     * Cookie header sent by {@link #getDocument(String)}.
     *
     * NOTE(review): this is a hard-coded session cookie (it carries PLATFORM_SESSION and csrfToken
     * values) and it is attached to whatever URL the caller passes in. It should presumably be
     * supplied by the caller or by configuration instead of living in source control - confirm.
     */
    private static final String SESSION_COOKIE =
            "PLAY_LANG=cn; _plh=b9289d0a863a8fc9c79fb938f15372f7731d13fb; PLATFORM_SESSION=39034d07000717c664134556ad39869771aabc04-_ldi=520275&_lsh=8cf91cdbcbbb255adff5cba6061f561b642f5157&csrfToken=209f20c8473bc0518413c226f898ff79cd69c3ff-1539926671235-b853a6a63c77dd8fcc364a58&_lpt=%2Fcn%2Fvehicle_sales%2Fsearch&_lsi=1646321; _ga=GA1.2.2146952143.1539926675; _gid=GA1.2.1032787565.1539926675; _plh_notime=8cf91cdbcbbb255adff5cba6061f561b642f5157";

    /** Upper bound, in milliseconds, that {@link #getDocument(String)} waits for background JavaScript. */
    private static final long JS_WAIT_MILLIS = 1000L;

    /**
     * Simplest way to fetch a page: opens an HTTP connection to the URL and prints the response
     * body to standard output line by line, decoding it as UTF-8.
     *
     * @param strURL the address of the page to fetch; must be an http/https URL because the
     *               connection is cast to {@link HttpURLConnection}
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static void captureHtml(String strURL) throws Exception {
        URL url = new URL(strURL);
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        // Pretend to be a browser; some sites reply 403 without a User-Agent.
        httpConn.setRequestProperty("User-Agent", CHROME_USER_AGENT);
        // try-with-resources closes the reader (and the underlying stream) even if reading fails.
        try (BufferedReader bufReader = new BufferedReader(
                new InputStreamReader(httpConn.getInputStream(), "utf-8"))) {
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            httpConn.disconnect();
        }
    }

    /**
     * Fetches a page with jsoup, runs a selector query on it and prints the matching elements
     * to standard output. An {@link IOException} during the fetch is printed and otherwise ignored.
     *
     * <p>Other jsoup lookup methods noted by the original author:
     * <ul>
     *   <li>{@code getElementById(String id)} - look up by id</li>
     *   <li>{@code getElementsByTag(String tagName)} - look up by tag name</li>
     *   <li>{@code getElementsByClass(String className)} - look up by class name</li>
     *   <li>{@code getElementsByAttribute(String key)} - look up by attribute name</li>
     *   <li>{@code getElementsByAttributeValue(String key, String value)} - look up by attribute
     *       name and value</li>
     * </ul>
     *
     * @param url the address of the page to fetch
     */
    public static void jsoup666(String url) {
        try {
            Document doc = Jsoup.connect(url).header("User-Agent", CHROME_USER_AGENT).get();
            // NOTE(review): a bare word in a jsoup selector matches by tag name, so this looks for
            // <约满> elements. If the intent was to find elements whose text is "约满", a
            // text pseudo-selector such as ":containsOwn(约满)" would be needed - confirm.
            Elements elementsByTag = doc.select("约满");
            System.out.println(elementsByTag);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Manual entry point: runs {@link #jsoup666(String)} against a fixed sample URL.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            String url = "https://www.91160.com/doctors/index/unit_id-105/dep_id-2354/docid-100220577.html";
            jsoup666(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Loads a page in an HtmlUnit {@code WebClient} emulating Chrome with JavaScript enabled,
     * gives background scripts a short time to run, and returns the resulting markup parsed
     * as a jsoup {@link Document}.
     *
     * <p>NOTE(review): the HtmlUnit types used here (WebClient, BrowserVersion, WebRequest,
     * HtmlPage, NicelyResynchronizingAjaxController) are not imported in the visible part of
     * this file - the corresponding imports presumably need to be added; confirm.
     *
     * @param url the address of the page to load
     * @return the parsed page, or {@code null} if an {@link IOException} occurred (the stack
     *         trace is printed in that case)
     * @throws IOException declared for callers; I/O failures inside the method are currently
     *         caught and reported as a {@code null} return
     * @throws InterruptedException kept in the signature for backward compatibility
     */
    public static Document getDocument(String url) throws IOException, InterruptedException {
        // try-with-resources: the client is closed on every exit path instead of being leaked.
        try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
            // Whether to use insecure SSL
            wc.getOptions().setUseInsecureSSL(true);
            // Enable the JS interpreter (true is the default)
            wc.getOptions().setJavaScriptEnabled(true);
            // Disable CSS
            wc.getOptions().setCssEnabled(false);
            // Do not throw when a script error occurs
            wc.getOptions().setThrowExceptionOnScriptError(false);
            // Do not throw on a failing HTTP status code
            wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Do not allow ActiveX
            wc.getOptions().setActiveXNative(false);
            // Install the Ajax controller, i.e. enable Ajax support
            wc.setAjaxController(new NicelyResynchronizingAjaxController());
            // Timeout
            wc.getOptions().setTimeout(1000000);
            // Do-Not-Track option is left off
            wc.getOptions().setDoNotTrackEnabled(false);

            WebRequest request = new WebRequest(new URL(url));
            request.setAdditionalHeader("User-Agent", FIREFOX_USER_AGENT);
            request.setAdditionalHeader("Cookie", SESSION_COOKIE);

            // Simulate a browser opening the target address
            HtmlPage htmlPage = wc.getPage(request);

            // Wait for background JavaScript only after the page is loaded; the former call
            // placed before getPage() had nothing to wait for. This replaces the fixed
            // one-second sleep: same upper bound, but returns as soon as the jobs are done.
            wc.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            // Take the response as XML text and convert it to a jsoup Document
            return Jsoup.parse(htmlPage.asXml());
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}