Java 中使用无头浏览器获取 Web 数据

需要用到谷歌驱动程序 chromedriver.exe

pom.xml 文件

```xml
<!-- Compile for Java 17 and treat source files as UTF-8. -->
<properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <!-- Selenium Java bindings: supplies the org.openqa.selenium classes imported by Main.java. -->
    <dependencies>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>4.22.0</version>
        </dependency>
    </dependencies>
```

closeLoginPanel.js 文件

// Inject a stylesheet that hides every <div> whose id starts with
// "login-full-panel-", whether or not such a panel is currently shown.
document.head.appendChild(
  Object.assign(document.createElement("style"), {
    innerHTML: "div[id^=login-full-panel-] {display: none;}",
  })
);


//scrollToBottom.js
document.title = "滚动到底部:开始";
return new Promise((resolve) => {
  let scrollHeight = document.body.scrollHeight;
  window.scrollTo(0, scrollHeight);
  let timer = setInterval(() => {
    if (scrollHeight < document.body.scrollHeight) {
      scrollHeight = document.body.scrollHeight;
      window.scrollTo(0, scrollHeight);
    } else {
      clearInterval(timer);
      document.title = "滚动到底部:结束";
      resolve();
    }
  }, 3000);
});

Main.java 

package org.example;

import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;


/**
 * Opens a Douyin user-search page in Chrome through Selenium, scrolls until the
 * result list stops growing, and prints the information found on each result card.
 *
 * <p>Expects {@code chromedriver.exe}, {@code closeLoginPanel.js} and
 * {@code scrollToBottom.js} in the working directory.
 */
public class Main {
    /** Upper bound for both the injected scroll script and the end-of-list wait. */
    private static final Duration LONG_TIMEOUT = Duration.ofSeconds(1800);

    /** Fixed pause after navigation, giving the page time to render before scripts run. */
    private static final long PAGE_SETTLE_MILLIS = 6000;

    /**
     * Reads a JavaScript file as UTF-8 text.
     *
     * @param filePath path of the script file; the file must be UTF-8 encoded
     * @return the file content, or, when the file cannot be read, a one-line script that
     *         shows a browser {@code alert} naming the file
     */
    public static String readJS(String filePath) {
        try {
            byte[] encoded = Files.readAllBytes(Paths.get(filePath));
            return new String(encoded, StandardCharsets.UTF_8);
        } catch (IOException e) {
            // Report the real cause; the alert text alone carries no detail.
            System.err.println("Failed to read script " + filePath + ": " + e);
            // Escape backslashes and single quotes so the path (e.g. a Windows path)
            // cannot terminate or corrupt the JavaScript string literal.
            String safePath = filePath.replace("\\", "\\\\").replace("'", "\\'");
            return "alert('" + safePath + "文件读取失败')";
        }
    }

    /**
     * Removes the query string from a URL.
     *
     * @param url the URL to shorten; may be {@code null}
     * @return everything before the first {@code '?'}, or {@code url} unchanged when it
     *         is {@code null} or has no {@code '?'}
     */
    private static String stripQuery(String url) {
        if (url == null) {
            return null;
        }
        int indexOfQuestionMark = url.indexOf('?');
        return indexOfQuestionMark == -1 ? url : url.substring(0, indexOfQuestionMark);
    }

    /**
     * Collects one entry per {@code .search-result-card} element on the current page.
     *
     * <p>Each map holds the keys {@code url}, {@code avatar}, {@code userName},
     * {@code douyinAccount}, {@code likeNumber}, {@code fansNumber} and {@code intro},
     * all with string values.
     *
     * @param driver driver whose current page contains the search results
     * @return one map per result card, in page order; empty when no card is found
     * @throws NoSuchElementException if a card lacks one of the expected child elements
     */
    @SuppressWarnings("rawtypes") // raw element type kept so existing callers still compile
    public static List<HashMap> getUserInfoList(WebDriver driver) {
        List<WebElement> cardElList = driver.findElements(By.cssSelector(".search-result-card"));
        List<HashMap> userInfoList = new ArrayList<>(cardElList.size());
        for (WebElement cardEl : cardElList) {
            HashMap<String, String> map = new HashMap<>();
            // Profile URL, without tracking/query parameters
            String href = cardEl.findElement(By.tagName("a")).getAttribute("href");
            map.put("url", stripQuery(href));
            // Avatar image
            String avatar = cardEl.findElement(By.tagName("img")).getAttribute("src");
            map.put("avatar", avatar);
            // User name
            String userName = cardEl.findElement(By.cssSelector("a>div>div>div>p>span>span>span>span>span>span")).getText().trim();
            map.put("userName", userName);
            // Douyin account id
            String douyinAccount = cardEl.findElement(By.cssSelector("a>div:nth-child(2)>span:nth-child(1)>span")).getText().trim();
            map.put("douyinAccount", douyinAccount);
            // Likes received: keep the text before the "获赞" label
            String likeNumber = cardEl.findElement(By.cssSelector("a>div:nth-child(2)>span:nth-child(3)")).getText().split("获赞")[0].trim();
            map.put("likeNumber", likeNumber);
            // Followers: keep the text before the "粉丝" label
            String fansNumber = cardEl.findElement(By.cssSelector("a>div:nth-child(2)>span:nth-child(5)")).getText().split("粉丝")[0].trim();
            map.put("fansNumber", fansNumber);
            // Profile introduction
            String intro = cardEl.findElement(By.cssSelector("a>p>span>span>span>span>span>span")).getText();
            map.put("intro", intro);

            userInfoList.add(map);
        }
        return userInfoList;
    }

    /**
     * Runs the scrape: opens the search page, hides the login panel, scrolls to the end
     * of the results, prints the collected user information, and closes the browser.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1");
        options.addArguments("disable-blink-features=AutomationControlled");
        System.setProperty("webdriver.chrome.driver", "./chromedriver.exe");
        WebDriver driver = new ChromeDriver(options);

        try {
            // Open the search page
            driver.get("https://www.douyin.com/search/%E6%A6%B4%E9%91%AB%E7%94%84%E9%80%89%E5%B0%8F%E6%97%B6%E8%BE%BE?type=user");
            Thread.sleep(PAGE_SETTLE_MILLIS);
            // scriptTimeout(Duration) replaces the deprecated setScriptTimeout in Selenium 4.
            driver.manage().timeouts().scriptTimeout(LONG_TIMEOUT);

            JavascriptExecutor js = (JavascriptExecutor) driver;

            // Hide the login panel via CSS whether or not it is currently shown
            js.executeScript(readJS("./closeLoginPanel.js"));

            System.out.println("滚动到底部:开始");
            // The script returns a Promise; executeScript is expected to block until it
            // settles (W3C Execute Script behaviour) - the long script timeout above allows for that.
            js.executeScript(readJS("./scrollToBottom.js"));
            // Additionally require the end-of-list marker before collecting results.
            WebDriverWait wait = new WebDriverWait(driver, LONG_TIMEOUT);
            wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath("//div[text()='暂时没有更多了']")));
            System.out.println("滚动到底部:结束");

            // Collect the users
            System.out.println("采集用户:开始");
            List<HashMap> userInfoList = getUserInfoList(driver);
            System.out.println("用户主页信息:");
            System.out.println(userInfoList);
            System.out.println("采集用户:结束");

        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (WebDriverException e) {
            throw new RuntimeException(e);
        } finally {
            // Always end the session; otherwise the browser and chromedriver process
            // stay alive after the program exits.
            driver.quit();
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值