Java爬取网页内容, Jsoup获取网页内容

package com.example.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Utility methods for fetching the content of a web page.
 *
 * <p>Three approaches are shown: a plain {@link HttpURLConnection} request
 * ({@link #captureHtml(String)}), a Jsoup request ({@link #jsoup666(String)}), and an
 * HtmlUnit {@code WebClient} request whose rendered page is re-parsed with Jsoup
 * ({@link #getDocument(String)}).
 *
 * @author lvgb
 */
public class getHtml {

	/**
	 * Browser-like User-Agent sent with requests; some sites answer 403 without one.
	 * NOTE(review): the value ends with a stray ")" after "Safari/537.36". It is kept
	 * byte-for-byte so requests are unchanged; confirm whether it should be removed.
	 */
	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36)";

	/**
	 * Fetches a page with {@link HttpURLConnection} and prints it to standard output
	 * line by line. The response body is decoded as UTF-8.
	 *
	 * <p>The reader (and with it the response stream) is closed and the connection is
	 * disconnected when the method finishes, whether it succeeds or fails.
	 *
	 * @param strURL address of the page to fetch
	 * @throws Exception if the URL is malformed or the request or read fails
	 */
	public static void captureHtml(String strURL) throws Exception {
		URL url = new URL(strURL);
		HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
		// Send a browser User-Agent; adding it avoids 403 responses from some sites.
		httpConn.setRequestProperty("User-Agent", USER_AGENT);
		try (BufferedReader bufReader = new BufferedReader(
				new InputStreamReader(httpConn.getInputStream(), "utf-8"))) {
			String line;
			while ((line = bufReader.readLine()) != null) {
				System.out.println(line);
			}
		} finally {
			// Release the connection even when reading fails part-way through.
			httpConn.disconnect();
		}
	}

	/**
	 * Fetches a page with Jsoup, runs the selector {@code 约满} against it and prints the
	 * matching elements to standard output.
	 *
	 * <p>Other Jsoup lookups that can be used on the returned document:
	 * <ul>
	 *   <li>{@code getElementById(String id)}: look up by id</li>
	 *   <li>{@code getElementsByTag(String tagName)}: look up by tag name</li>
	 *   <li>{@code getElementsByClass(String className)}: look up by class name</li>
	 *   <li>{@code getElementsByAttribute(String key)}: look up by attribute name</li>
	 *   <li>{@code getElementsByAttributeValue(String key, String value)}: look up by
	 *       attribute name and value</li>
	 * </ul>
	 *
	 * <p>A POST variant of the request can be built on the same connection with
	 * {@code data(...)}, {@code userAgent(...)}, {@code cookie(...)}, {@code timeout(...)}
	 * and {@code post()}.
	 *
	 * <p>An {@link IOException} raised while fetching is printed and not rethrown.
	 *
	 * @param url address of the page to fetch
	 */
	public static void jsoup666(String url){
		try {
			Document doc = Jsoup.connect(url).header("User-Agent", USER_AGENT).get();
			// NOTE(review): a bare word in a CSS selector is a tag-name match, so this looks
			// for <约满> elements. If the intent is "elements containing the text 约满", a
			// text pseudo-selector such as :containsOwn(约满) would be needed; confirm.
			Elements elementsByTag = doc.select("约满");
			System.out.println(elementsByTag);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Demo entry point: runs {@link #jsoup666(String)} against a fixed sample URL.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			String url = "https://www.91160.com/doctors/index/unit_id-105/dep_id-2354/docid-100220577.html";
			jsoup666(url);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Loads a page with an HtmlUnit {@code WebClient} (JavaScript enabled) so that
	 * script-driven requests get a chance to run, then returns the resulting page,
	 * serialised as XML, parsed into a Jsoup {@link Document}.
	 *
	 * <p>The client is closed before the method returns, on success and on failure.
	 *
	 * <p>NOTE(review): this file has no imports for the HtmlUnit types used here
	 * ({@code WebClient}, {@code BrowserVersion}, {@code WebRequest}, {@code HtmlPage},
	 * {@code NicelyResynchronizingAjaxController}); they must be added for the class to
	 * compile.
	 *
	 * @param url address of the page to load
	 * @return the parsed document, or {@code null} if an {@link IOException} occurred
	 *         (the exception is printed, not rethrown)
	 * @throws IOException declared for callers; the visible body catches it instead
	 * @throws InterruptedException if the thread is interrupted during the post-load sleep
	 */
	public static Document getDocument(String url) throws IOException, InterruptedException{
		try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
			// Accept insecure SSL connections.
			wc.getOptions().setUseInsecureSSL(true);
			// Enable the JavaScript interpreter (the original comment notes this is the default).
			wc.getOptions().setJavaScriptEnabled(true);
			// Disable CSS processing.
			wc.getOptions().setCssEnabled(false);
			// Do not throw when a script fails.
			wc.getOptions().setThrowExceptionOnScriptError(false);
			// Do not throw on failing HTTP status codes.
			wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
			// Do not allow native ActiveX.
			wc.getOptions().setActiveXNative(false);
			// Wait for background JavaScript.
			// NOTE(review): this runs before getPage(), so presumably no page script
			// exists yet to wait for; consider moving it after the page load. Confirm.
			wc.waitForBackgroundJavaScript(600*1000);
			// Install an Ajax controller to enable Ajax support.
			wc.setAjaxController(new NicelyResynchronizingAjaxController());
			// Request timeout.
			wc.getOptions().setTimeout(1000000);
			// Do-Not-Track flag is switched off.
			wc.getOptions().setDoNotTrackEnabled(false);

			WebRequest request = new WebRequest(new URL(url));
			request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0");
			// NOTE(review): hard-coded session cookie (contains a session id and CSRF token).
			// It should not live in source; supply it from configuration instead.
			request.setAdditionalHeader("Cookie","PLAY_LANG=cn; _plh=b9289d0a863a8fc9c79fb938f15372f7731d13fb; PLATFORM_SESSION=39034d07000717c664134556ad39869771aabc04-_ldi=520275&_lsh=8cf91cdbcbbb255adff5cba6061f561b642f5157&csrfToken=209f20c8473bc0518413c226f898ff79cd69c3ff-1539926671235-b853a6a63c77dd8fcc364a58&_lpt=%2Fcn%2Fvehicle_sales%2Fsearch&_lsi=1646321; _ga=GA1.2.2146952143.1539926675; _gid=GA1.2.1032787565.1539926675; _plh_notime=8cf91cdbcbbb255adff5cba6061f561b642f5157");

			// Open the target address as a simulated browser.
			HtmlPage htmlPage = wc.getPage(request);
			// Sleep so that script-loaded data has time to arrive.
			Thread.sleep(1000);
			// Serialise the page as XML and hand it to Jsoup.
			String xml = htmlPage.asXml();
			return Jsoup.parse(xml);
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值