通过java获取抖音用户主页信息（2020年9月）

最新推荐文章于 2025-03-11 14:41:28 发布

不丸子

最新推荐文章于 2025-03-11 14:41:28 发布

阅读量3.1k

点赞数 3

分类专栏： Java随笔　文章标签： java maven http

本文链接：https://blog.csdn.net/qq_26993175/article/details/108318496

版权

Java随笔专栏收录该内容

53 篇文章

订阅专栏

本文介绍如何使用 Java 和 HtmlUnit 库获取抖音用户主页的详细信息，包括头像 URL、昵称、个人简介、关注数、粉丝数、获赞数、作品数和喜欢数。代码基于 Maven 项目，使用 STS 编辑器和 JDK 1.8，通过发送 HTTP 请求并解析返回的 HTML 文档来提取数据。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

通过java获取抖音用户主页信息（2020年9月）

环境准备

本文基于 STS 编辑器、JDK 1.8 和 Maven 3。

项目结构如下图

在这里插入图片描述

执行结果如下图

在这里插入图片描述

HTTP请求工具类

HttpUtils代码如下

package test1;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Fetches web pages through HtmlUnit so that the page's JavaScript is executed
 * before the markup is returned, and converts the result into jsoup documents.
 *
 * <p>Instances are obtained through {@link #getInstance()}. The lazy
 * initialization and the mutable timeout settings are not synchronized.
 */
public class HttpUtils {
    /**
     * Request timeout in milliseconds; defaults to 30 seconds.
     */
    private int timeout = 30000;
    /**
     * How long to wait for asynchronous JavaScript, in milliseconds; defaults to 20 seconds.
     */
    private int waitForBackgroundJavaScript = 20000;

    private static HttpUtils httpUtils;

    private HttpUtils() {
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the single {@code HttpUtils} instance
     */
    public static HttpUtils getInstance() {
        if (httpUtils == null) {
            httpUtils = new HttpUtils();
        }
        return httpUtils;
    }

    /**
     * @return the request timeout in milliseconds
     */
    public int getTimeout() {
        return timeout;
    }

    /**
     * Sets the request timeout. The same value is also applied as the
     * JavaScript execution timeout when a page is fetched.
     *
     * @param timeout timeout in milliseconds
     */
    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    /**
     * @return how long a fetch waits for background JavaScript, in milliseconds
     */
    public int getWaitForBackgroundJavaScript() {
        return waitForBackgroundJavaScript;
    }

    /**
     * Sets how long a fetch waits for asynchronous JavaScript to finish.
     *
     * @param waitForBackgroundJavaScript wait time in milliseconds
     */
    public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
        this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
    }

    /**
     * Parses an HTML string into a jsoup document with {@code &nbsp;} entities removed.
     *
     * @param html the HTML markup to parse
     * @return the parsed document
     * @throws Exception kept in the signature for existing callers
     */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    /**
     * Parses the markup, strips every {@code &nbsp;} entity from the serialized
     * form, and parses the cleaned markup again.
     */
    private static Document removeHtmlSpace(String str) {
        Document doc = Jsoup.parse(str);
        String result = doc.html().replace("&nbsp;", "");
        return Jsoup.parse(result);
    }

    /**
     * Loads the page at {@code url}, waits for background JavaScript, and
     * returns the resulting page serialized as XML.
     *
     * @param url the address of the page to load
     * @return the rendered page as an XML string
     * @throws Exception if the page cannot be loaded or rendered
     */
    public String getHtmlPageResponse(String url) throws Exception {
        final WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // Do not abort on script errors or non-200 responses.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setCssEnabled(false);
            // JavaScript must be enabled: the target page fills in its content asynchronously.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            webClient.getOptions().setTimeout(timeout);
            webClient.setJavaScriptTimeout(timeout);

            HtmlPage page = webClient.getPage(url);
            // Blocks the calling thread while background scripts run.
            webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);

            return page.asXml();
        } finally {
            // Release the client on every path, including failures after getPage().
            webClient.close();
        }
    }

    /**
     * Loads the page at {@code url} (waiting for background JavaScript) and
     * returns it as a jsoup document.
     *
     * @param url the address of the page to load
     * @return the parsed document
     * @throws Exception if the page cannot be loaded or parsed
     */
    public Document getHtmlPageResponseAsDocument(String url) throws Exception {
        return parseHtmlToDoc(getHtmlPageResponse(url));
    }
}

基于 JUnit 的测试类

package test1;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;


public class HttpUtilsTest {
    private static final String TEST_URL = "用户主页url";
    @Test
    public void testGetHtmlPageResponseAsDocument() {
        HttpUtils httpUtils = HttpUtils.getInstance();
        httpUtils.setTimeout(30000);
        httpUtils.setWaitForBackgroundJavaScript(30000);
        try {
            Document document = httpUtils.getHtmlPageResponseAsDocument(TEST_URL);
            //TODO
            //System.out.println(document);
            
            Element element = document.getElementById("pagelet-user-info");//获取元素节点等
            
            //System.out.println(element);
            System.out.println("头像url："+element.getElementsByTag("img").attr("src"));
            System.out.println("昵称："+element.getElementsByTag("p").get(0).text());
            System.out.println(element.getElementsByTag("p").get(1).text());
            System.out.println("签名："+element.getElementsByTag("p").get(2).text());
           
            System.out.println("关注："+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(0).text());
            System.out.println("粉丝："+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(1).text());
            System.out.println("赞："+element.getElementsByTag("p").get(3).getElementsByAttributeValue("class", "num").get(2).text());
            
            System.out.println("作品："+element.getElementsByAttributeValue("class", "video-tab").first().getElementsByAttributeValue("class", "num").get(0).text());
            System.out.println("喜欢："+element.getElementsByAttributeValue("class", "video-tab").first().getElementsByAttributeValue("class", "num").get(1).text());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.test1</groupId>
  <artifactId>test1</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- Compile for Java 8 explicitly; the compiler plugin's default level may be lower. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- Headless browser used to load the page and run its JavaScript. -->
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.27</version>
    </dependency>

    <!-- HTML parser used to query the rendered markup. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>

    <!-- No test scope is declared, as in the original; add one only if the test class lives under src/test/java. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
  </dependencies>
</project>