静态爬取
/**
 * Small HTTP helper for static (non-JavaScript-rendered) page crawling:
 * fetches a URL with {@link HttpURLConnection} and parses the response body with jsoup.
 */
public class HttpUtil {
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) " +
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Timeout, in milliseconds, used for both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Issues a GET request to {@code url} and parses the response body into a jsoup document.
     *
     * @param url     absolute URL including the scheme (for example {@code https://...});
     *                it is also passed to jsoup as the base URI
     * @param charset character set name handed to {@code Jsoup.parse} for decoding the body
     * @return the parsed document, or {@code null} when reading or parsing the response fails
     *         (the failure is printed to standard output / standard error)
     * @throws IOException if the URL is malformed (e.g. has no scheme) or the connection
     *                     cannot be opened or configured
     */
    public static Document get(String url, String charset) throws IOException {
        URL target = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) target.openConnection();
        try {
            connection.setRequestMethod("GET");
            // Bypass any cached response (the default is to allow caching).
            connection.setUseCaches(false);
            // Request headers.
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // getInputStream() performs the actual request; try-with-resources guarantees
            // the stream is closed whether or not parsing succeeds.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Best effort: report the failing URL and let the caller handle a null result.
                System.out.println("parse error: " + url);
                e.printStackTrace();
                return null;
            }
        } finally {
            // Release the underlying socket on every exit path.
            connection.disconnect();
        }
    }
}
/**
 * Manual smoke test: fetches the Baidu home page through {@code HttpUtil.get}
 * and prints the parsed document (or {@code doc:null} if the fetch failed).
 */
public static void testDoc(){
    // The scheme is mandatory: HttpUtil.get builds a java.net.URL from this string, and
    // a bare host such as "www.baidu.com" fails with MalformedURLException (no protocol).
    String url = "https://www.baidu.com";
    try {
        Document doc = HttpUtil.get(url , "utf-8");
        System.out.println("doc:" + doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
动态爬取
有时候仅仅静态爬取并爬不到想要的数据,需要对渲染后的页面进行爬取
1. 可以通过内置浏览器内核获取 JS 渲染后的页面,这类工具有 Selenium、HtmlUnit 和 PhantomJS。PhantomJS 需要下载与操作系统对应的浏览器内核,而我本地和服务器的操作系统不一样,所以没有选择它;HtmlUnit 报错太多,也没有选择。
2. cdp4j:调用本地的 Chrome 浏览器;但服务器上如果没有安装 Chrome,就会报错。
3.最后选择了直接读取对应请求获取json数据。
下面是 cdp4j 的示例:
<dependency>
<groupId>io.webfolder</groupId>
<artifactId>cdp4j</artifactId>
<version>2.2.1</version>
</dependency>
public static void cdp(String url ){
ArrayList<String> command = new ArrayList<String>();
//不显示google 浏览器
command.add("--headless");
Launcher launcher = new Launcher();
try (SessionFactory factory = launcher.launch(command);
Session session = factory.create()) {
session.navigate(url);
session.waitDocumentReady();
String content = (String) session.getContent();
Document doc = Jsoup.parse(content);
Element dateElement = doc.getElementsByClass("post__description").get(0);
System.out.println("text:" + dateElement.text());
Element docElements = doc.getElementsByClass("post__body").get(0);
int i = 5;
Elements liElements = docElements.getElementsByTag("li");
for (Element element : liElements) {
if (i <= 0) break;
Element el = element.select("a").first();
String href = el != null ? el.attr("href") : "";
href = pre + href;
String title = element.select("a").text();
System.out.println(" href:" + href +" title:" + title);
i--;
}
System.out.println("end");
}