需要用到谷歌浏览器(Chrome)的驱动程序 chromedriver.exe,放在程序运行目录下(代码中通过 ./chromedriver.exe 引用)
pom.xml 文件
```xml
<!-- Build settings: compile source and target for Java 17, read sources as UTF-8. -->
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Selenium WebDriver Java bindings, pinned to version 4.22.0. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.22.0</version>
</dependency>
</dependencies>
```
closeLoginPanel.js 文件
// Hide the login overlay with an injected CSS rule: every <div> whose id
// starts with "login-full-panel-" gets display: none.
(() => {
  const hideRule = "div[id^=login-full-panel-] {display: none;}";
  const sheet = document.createElement("style");
  sheet.innerHTML = hideRule;
  document.head.appendChild(sheet);
})();
scrollToBottom.js 文件
document.title = "滚动到底部:开始";
return new Promise((resolve) => {
let scrollHeight = document.body.scrollHeight;
window.scrollTo(0, scrollHeight);
let timer = setInterval(() => {
if (scrollHeight < document.body.scrollHeight) {
scrollHeight = document.body.scrollHeight;
window.scrollTo(0, scrollHeight);
} else {
clearInterval(timer);
document.title = "滚动到底部:结束";
resolve();
}
}, 3000);
});
Main.java 文件
package org.example;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Opens a Douyin user-search page in Chrome, scrolls until the end-of-results
 * marker is present, then prints the data scraped from every result card.
 *
 * <p>Expects {@code ./chromedriver.exe}, {@code ./closeLoginPanel.js} and
 * {@code ./scrollToBottom.js} relative to the working directory.
 */
public class Main {
    /** Fixed pause after opening the page, before any script is injected. */
    private static final long PAGE_SETTLE_MILLIS = 6000;

    /** Upper bound for both the injected scroll script and the end-marker wait. */
    private static final Duration LONG_TIMEOUT = Duration.ofSeconds(1800);

    /**
     * Reads a JavaScript file as UTF-8 text.
     *
     * @param filePath path of the script file; the file must be UTF-8 encoded
     * @return the file content, or, when the file cannot be read, a one-line
     *         {@code alert(...)} script that reports the failure in the browser
     */
    public static String readJS(String filePath) {
        try {
            byte[] encoded = Files.readAllBytes(Paths.get(filePath));
            return new String(encoded, StandardCharsets.UTF_8);
        } catch (IOException e) {
            // Keep the cause visible on the console; the alert only carries the path.
            System.err.println("Failed to read script " + filePath + ": " + e);
            // Escape the path so quotes or backslashes cannot break the JS literal.
            return "alert('" + escapeForJsString(filePath) + "文件读取失败')";
        }
    }

    /** Escapes backslashes, single quotes and line breaks for a single-quoted JS string. */
    private static String escapeForJsString(String raw) {
        return raw.replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace("\r", "\\r")
                .replace("\n", "\\n");
    }

    /**
     * Collects profile data from every {@code .search-result-card} element on the
     * current page.
     *
     * <p>Each map holds the keys {@code url}, {@code avatar}, {@code userName},
     * {@code douyinAccount}, {@code likeNumber}, {@code fansNumber} and
     * {@code intro}. A field whose element or attribute is missing from a card is
     * stored as an empty string, so one incomplete card does not abort the run.
     *
     * @param driver driver whose current page contains the search results
     * @return one map per result card, in page order; empty if there are no cards
     */
    public static List<HashMap> getUserInfoList(WebDriver driver) {
        // The raw element type is kept in the signature for existing callers.
        List<HashMap> userInfoList = new ArrayList<>();
        for (WebElement cardEl : driver.findElements(By.cssSelector(".search-result-card"))) {
            HashMap<String, String> map = new HashMap<>();
            // Profile URL without its query string
            map.put("url", stripQuery(firstAttribute(cardEl, By.tagName("a"), "href")));
            // Avatar image
            map.put("avatar", firstAttribute(cardEl, By.tagName("img"), "src"));
            // User name
            map.put("userName",
                    firstText(cardEl, "a>div>div>div>p>span>span>span>span>span>span").trim());
            // Douyin account id
            map.put("douyinAccount",
                    firstText(cardEl, "a>div:nth-child(2)>span:nth-child(1)>span").trim());
            // Like count: the text before the "获赞" label
            map.put("likeNumber",
                    beforeLabel(firstText(cardEl, "a>div:nth-child(2)>span:nth-child(3)"), "获赞").trim());
            // Follower count: the text before the "粉丝" label
            map.put("fansNumber",
                    beforeLabel(firstText(cardEl, "a>div:nth-child(2)>span:nth-child(5)"), "粉丝").trim());
            // Introduction
            map.put("intro", firstText(cardEl, "a>p>span>span>span>span>span>span"));
            userInfoList.add(map);
        }
        return userInfoList;
    }

    /** Returns the text of the first descendant matching the CSS selector, or "" if none. */
    private static String firstText(WebElement root, String cssSelector) {
        // findElements returns an empty list instead of throwing when nothing matches.
        List<WebElement> found = root.findElements(By.cssSelector(cssSelector));
        return found.isEmpty() ? "" : found.get(0).getText();
    }

    /** Returns an attribute of the first descendant matching the locator, or "" if absent. */
    private static String firstAttribute(WebElement root, By locator, String attribute) {
        List<WebElement> found = root.findElements(locator);
        if (found.isEmpty()) {
            return "";
        }
        String value = found.get(0).getAttribute(attribute);
        return value == null ? "" : value;
    }

    /** Drops everything from the first '?' onwards. */
    private static String stripQuery(String url) {
        int indexOfQuestionMark = url.indexOf('?');
        return indexOfQuestionMark == -1 ? url : url.substring(0, indexOfQuestionMark);
    }

    /** Returns the part of the text before the label, or the whole text if the label is absent. */
    private static String beforeLabel(String text, String label) {
        int labelIndex = text.indexOf(label);
        return labelIndex == -1 ? text : text.substring(0, labelIndex);
    }

    /**
     * Runs the scrape and prints the collected user data to standard output.
     *
     * @param args unused
     * @throws RuntimeException wrapping any WebDriver failure or thread interruption
     */
    public static void main(String[] args) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1");
        options.addArguments("disable-blink-features=AutomationControlled");
        System.setProperty("webdriver.chrome.driver", "./chromedriver.exe");
        WebDriver driver = new ChromeDriver(options);
        try {
            // Open the search page
            driver.get("https://www.douyin.com/search/%E6%A6%B4%E9%91%AB%E7%94%84%E9%80%89%E5%B0%8F%E6%97%B6%E8%BE%BE?type=user");
            Thread.sleep(PAGE_SETTLE_MILLIS);
            driver.manage().timeouts().scriptTimeout(LONG_TIMEOUT);
            JavascriptExecutor js = (JavascriptExecutor) driver;
            // Hide the login panel via CSS whether or not it is currently shown
            js.executeScript(readJS("./closeLoginPanel.js"));
            System.out.println("滚动到底部:开始");
            // NOTE(review): relies on the driver waiting for the Promise the script
            // returns, hence the long script timeout - confirm for the driver in use.
            js.executeScript(readJS("./scrollToBottom.js"));
            WebDriverWait wait = new WebDriverWait(driver, LONG_TIMEOUT);
            wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath("//div[text()='暂时没有更多了']")));
            System.out.println("滚动到底部:结束");
            // Collect the users
            System.out.println("采集用户:开始");
            List<HashMap> userInfoList = getUserInfoList(driver);
            System.out.println("用户主页信息:");
            System.out.println(userInfoList);
            System.out.println("采集用户:结束");
        } catch (InterruptedException e) {
            // Restore the interrupt flag before converting to an unchecked exception.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (WebDriverException e) {
            throw new RuntimeException(e);
        } finally {
            // Always close the browser and stop the driver process.
            driver.quit();
        }
    }
}