通过java获取抖音用户主页信息(2020年9月)
环境准备
本篇文章基于STS(Spring Tool Suite)编辑器、JDK 1.8、Maven 3
项目结构如下图
执行结果如下图
html请求工具类
HttpUtils代码如下
package test1;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Singleton helper that loads a web page through HtmlUnit so that the page's
 * JavaScript has a chance to run, and returns the resulting markup either as
 * an XML string or as a parsed jsoup {@link Document}.
 *
 * <p>The timeout settings are plain mutable fields on the shared instance;
 * no synchronization is performed on them.
 */
public class HttpUtils {

    /** Request timeout in milliseconds; defaults to 30 seconds. */
    private int timeout = 30000;

    /** Time in milliseconds to wait for background JavaScript; defaults to 20 seconds. */
    private int waitForBackgroundJavaScript = 20000;

    /** The single shared instance, created when the class is initialized. */
    private static final HttpUtils INSTANCE = new HttpUtils();

    private HttpUtils() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the singleton {@code HttpUtils}
     */
    public static HttpUtils getInstance() {
        return INSTANCE;
    }

    /**
     * Returns the request timeout.
     *
     * @return the timeout in milliseconds
     */
    public int getTimeout() {
        return timeout;
    }

    /**
     * Sets the request timeout. The same value is also used as the JavaScript
     * execution timeout when a page is fetched.
     *
     * @param timeout the timeout in milliseconds
     */
    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    /**
     * Returns how long a fetch waits for background JavaScript.
     *
     * @return the wait time in milliseconds
     */
    public int getWaitForBackgroundJavaScript() {
        return waitForBackgroundJavaScript;
    }

    /**
     * Sets how long a fetch waits for asynchronous JavaScript before the page
     * markup is captured.
     *
     * @param waitForBackgroundJavaScript the wait time in milliseconds
     */
    public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
        this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
    }

    /**
     * Parses an HTML string into a jsoup document with non-breaking spaces removed.
     *
     * @param html the markup to parse
     * @return the parsed document
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    /**
     * Parses the markup, strips non-breaking spaces from its serialized form and
     * parses the result again.
     *
     * <p>Only the {@code &nbsp;} entity and the raw U+00A0 character are removed.
     * Ordinary spaces must stay, because they separate tag names from attributes
     * in the serialized HTML.
     */
    private static Document removeHtmlSpace(String str) {
        Document doc = Jsoup.parse(str);
        String result = doc.html()
                .replace("&nbsp;", "")
                .replace("\u00a0", "");
        return Jsoup.parse(result);
    }

    /**
     * Loads the page at {@code url}, waits for background JavaScript and returns
     * the page serialized as XML.
     *
     * @param url the address of the page to load
     * @return the page markup as produced by {@code HtmlPage.asXml()}
     * @throws Exception if loading the page or serializing it fails
     */
    public String getHtmlPageResponse(String url) throws Exception {
        // try-with-resources closes the client on every path, including failures
        // while waiting for JavaScript or serializing the page.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not fail the fetch on script errors or non-200 responses.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setCssEnabled(false);
            // JavaScript and AJAX support are required for pages that fill in
            // their content after the initial load.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.getOptions().setTimeout(timeout);
            webClient.setJavaScriptTimeout(timeout);

            HtmlPage page = webClient.getPage(url);
            // Blocks the calling thread for up to the configured wait time.
            webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
            return page.asXml();
        }
    }

    /**
     * Loads the page at {@code url} (waiting for background JavaScript) and
     * returns it as a parsed jsoup document.
     *
     * @param url the address of the page to load
     * @return the parsed document
     * @throws Exception if loading or parsing the page fails
     */
    public Document getHtmlPageResponseAsDocument(String url) throws Exception {
        return parseHtmlToDoc(getHtmlPageResponse(url));
    }
}
基于JUnit的测试类
package test1;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
public class HttpUtilsTest {
private static final String TEST_URL = "用户主页url";
@Test
public void testGetHtmlPageResponseAsDocument() {
HttpUtils httpUtils = HttpUtils.getInstance();
httpUtils.setTimeout(30000);
httpUtils.setWaitForBackgroundJavaScript(30000);
try {
Document document = httpUtils.getHtmlPageResponseAsDocument(TEST_URL);
//TODO
//System.out.println(document);
Element element = document.getElementById("pagelet-user-info");//获取元素节点等
//System.out.println(element);
System.out.println("头像url:"+element.getElementsByTag("img").attr("src"));
System.out.println("昵称:"+element.getElementsByTag("p").get(0).text());
System.out.println(element.getElementsByTag("p").get(1).text());
System.out.println("签名:"+element.getElementsByTag("p").get(2).text());
System.out.println("关注:"+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(0).text());
System.out.println("粉丝:"+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(1).text());
System.out.println("赞:"+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(2).text());
System.out.println("作品:"+element.getElementsByAttributeValue("class", "video-tab").first().getElementsByAttributeValue("class", "num").get(0).text());
System.out.println("喜欢:"+element.getElementsByAttributeValue("class", "video-tab").first().getElementsByAttributeValue("class", "num").get(1).text());
} catch (Exception e) {
e.printStackTrace();
}
}
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the example project; it declares only the three libraries the sample code imports. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.test1</groupId>
<artifactId>test1</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<!-- HtmlUnit: provides WebClient/HtmlPage, used by HttpUtils to load pages with JavaScript enabled. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
<!-- jsoup: provides Jsoup/Document/Element, used to parse and query the fetched HTML. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- JUnit 4: provides the @Test annotation used by HttpUtilsTest. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
</dependencies>
</project>
以上为获取抖音用户主页信息的全部内容。随着抖音页面的不断更新,代码需要及时调整,本文也会持续更新。初来乍到,不足之处还请大神指教,谢谢!