package com.yan.Pchong;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.logging.Level;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Small crawler for the app store landing page at {@code http://app.mi.com/}.
 *
 * <p>{@link #main(String[])} downloads the landing page with Jsoup, walks every {@code <a>} found
 * inside elements carrying the CSS class {@code category-list}, and passes each link to
 * {@link #cha1(String)}. {@code cha1} renders the page with HtmlUnit (JavaScript enabled) and
 * re-parses the rendered markup with Jsoup.
 */
public class Pachong {

    /** Landing page whose category links are crawled. */
    private static final String BASE_URL = "http://app.mi.com/";

    /** User-Agent header sent with the initial Jsoup request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36";

    /**
     * Links whose {@code href} ends with this suffix are not crawled.
     * NOTE(review): presumably this excludes one specific category id; confirm the intent.
     */
    private static final String SKIPPED_HREF_SUFFIX = "15";

    /** Timeout handed to HtmlUnit's {@code WebClientOptions.setTimeout}, in milliseconds. */
    private static final int PAGE_TIMEOUT_MILLIS = 5 * 1000;

    /** Maximum time to wait for background JavaScript after a page load, in milliseconds. */
    private static final int JS_WAIT_MILLIS = 5 * 1000;

    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(Pachong.class.getName());

    // java.util.logging only keeps weak references to loggers. Holding them in static fields
    // guarantees the Level.OFF configuration applied below is not lost to garbage collection.
    private static final java.util.logging.Logger HTMLUNIT_LOGGER =
            java.util.logging.Logger.getLogger("com.gargoylesoftware");
    private static final java.util.logging.Logger HTTP_CLIENT_LOGGER =
            java.util.logging.Logger.getLogger("org.apache.http.client");

    /**
     * Fetches the landing page and crawls every category link found on it.
     *
     * <p>A failure while loading one category page is logged and does not stop the crawl.
     *
     * @param args unused
     * @throws IOException if the landing page itself cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect(BASE_URL);
        connect.userAgent(USER_AGENT);
        Document document = connect.get();

        for (Element categoryList : document.getElementsByClass("category-list")) {
            for (Element link : categoryList.getElementsByTag("a")) {
                String href = link.attr("href");
                if (href.isEmpty() || href.endsWith(SKIPPED_HREF_SUFFIX)) {
                    continue;
                }
                // Resolve against the document's base URI instead of concatenating strings, so
                // absolute hrefs stay intact; absUrl yields "" when no absolute URL can be built.
                String url = link.absUrl("href");
                if (url.isEmpty()) {
                    continue;
                }
                try {
                    cha1(url);
                } catch (FailingHttpStatusCodeException | IOException e) {
                    LOG.log(Level.WARNING, "Failed to load " + url, e);
                }
            }
        }
    }

    /**
     * Loads {@code url} in a headless HtmlUnit browser, waits for background JavaScript, and
     * parses the rendered markup with Jsoup.
     *
     * @param url absolute URL of the page to load
     * @throws FailingHttpStatusCodeException declared for compatibility; the client is configured
     *     not to throw on failing status codes
     * @throws MalformedURLException if {@code url} is not a valid URL
     * @throws IOException if the page cannot be retrieved
     */
    public static void cha1(String url) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        silenceLibraryLogging();
        System.out.println("Loading page now-----------------------------------------------: "+url);

        // WebClient is AutoCloseable; closing it releases its windows and background JS threads.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true); // JS interpreter on (the default)
            webClient.getOptions().setCssEnabled(false); // CSS support is not needed
            webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate script errors
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setTimeout(PAGE_TIMEOUT_MILLIS);

            HtmlPage page = webClient.getPage(url);
            // Give asynchronously started scripts up to JS_WAIT_MILLIS to finish.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
            String pageAsXml = page.asXml();

            // Re-parse the rendered markup with Jsoup, using the page URL as base URI.
            Document doc = Jsoup.parse(pageAsXml, url);
            // Element with id "all-applist"; null when the page has no such element.
            // NOTE(review): the result is not consumed yet - further processing appears unfinished.
            Element appList = doc.getElementById("all-applist");
        }
    }

    /** Turns off the log output of HtmlUnit, commons-logging and Apache HttpClient. */
    private static void silenceLibraryLogging() {
        LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
        HTMLUNIT_LOGGER.setLevel(Level.OFF);
        HTTP_CLIENT_LOGGER.setLevel(Level.OFF);
    }
}
===================================================================
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.29</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>