本篇介绍如何抓取运行js后的网页,如果需要解析DOM可以结合Jsoup。
1. 下载PhantomJS
2. PhantomJS整合
解压下载的zip后把phantomjs.exe拷贝到项目目录下(下文Java代码中exePath指向的位置是 项目目录/phantomjs/bin/phantomjs.exe,两者需保持一致)。
3. PhantomJS需要js文件
PhantomJS需要通过js文件获取html数据流,在项目的目录下建立js文件(下文Java代码中使用的文件名是Phantomjs2.js)。
js文件代码:
// PhantomJS script: opens the URL passed on the command line, gives the page
// five seconds to run its own JavaScript, then prints the resulting HTML to
// stdout for the calling process to read.
var system = require('system');
var page = require('webpage').create();

// system.args[0] is the script name, so the first user-supplied argument
// (the URL to load) is at index 1.
var url = system.args[1];

if (!url) {
    console.log('Usage: phantomjs <script.js> <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            // Message text is kept unchanged because the caller reads stdout;
            // the non-zero exit code additionally signals the failure.
            console.log('Unable to post!');
            phantom.exit(1);
        } else {
            // Delay the dump so scripts on the page have time to modify the DOM.
            window.setTimeout(function () {
                console.log(page.content);
                phantom.exit();
            }, 5000);
        }
    });
}
4. PhantomJS java获取html数据流
// Locations of the PhantomJS executable and the driver script, both resolved
// against the JVM working directory ("user.dir"). The directory layout
// (<project>/phantomjs/bin/) and the script name must match what is on disk,
// otherwise the process cannot be started.
private static final String projectPath = System.getProperty("user.dir");
private static final String jsPath = projectPath + File.separator
        + "Phantomjs2.js";
// The binary is named phantomjs.exe only on Windows; on other platforms the
// executable is expected to be plain "phantomjs".
private static final String exePath = projectPath + File.separator + "phantomjs"
        + File.separator + "bin" + File.separator
        + (System.getProperty("os.name", "").startsWith("Windows")
                ? "phantomjs.exe" : "phantomjs");
/**
 * Runs PhantomJS with the driver script and the given URL and returns
 * everything the process writes to standard output.
 *
 * <p>Output lines are concatenated without line separators. Failures while
 * starting or reading from the process are not propagated: the stack trace
 * is printed and whatever was read so far (possibly an empty string) is
 * returned.
 *
 * @param url address of the page to render; passed to PhantomJS as one
 *            argument, so it is not split on whitespace
 * @return the captured standard output of the PhantomJS process
 */
public static String getParseredHtml(String url) {
    StringBuilder html = new StringBuilder();
    Process process = null;
    try {
        // An argument list (instead of one concatenated command string) keeps
        // paths containing spaces intact and stops the URL from being split
        // into extra arguments.
        process = new ProcessBuilder(exePath, jsPath, url)
                // Forward stderr to this JVM's stderr so the child cannot
                // block on a full, unread error pipe.
                .redirectError(ProcessBuilder.Redirect.INHERIT)
                .start();
        // Decode as UTF-8 (PhantomJS's documented default output encoding)
        // rather than the platform charset.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                process.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        }
    } catch (Exception e) {
        // Best effort by design: report the problem and return what we have.
        e.printStackTrace();
    } finally {
        if (process != null) {
            // Make sure the child process does not outlive this call.
            process.destroy();
        }
    }
    return html.toString();
}
5. 调用PhantomJS获取动态网页
// Example call: just pass in the URL of the page to fetch.
String contentStr = getParseredHtml("https://www.tianyancha.com/");
System.out.println("html: " + contentStr);