环境说明
- JDK 8
- Jsoup 1.14.3
- Selenium 3.141.59
- Chrome 119.0.6045.106
- ChromeDriver 119.0.6045.105
- MySQL 8.0.32(可选)
- Mybatis 3.5.10(可选)
环境准备
- 获取 chromedriver(注意:版本要和 Chrome 浏览器版本对应)。下载地址:https://googlechromelabs.github.io/chrome-for-testing/
- 添加 Jsoup 和 Selenium 依赖:
<!-- Jsoup: HTML parser used to read the rendered page source -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

<!-- Selenium: drives the Chrome browser -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.141.59</version>
</dependency>
- 添加 MySQL 和 MyBatis 依赖,用来将爬取的数据存入数据库(可选):
<!-- MySQL JDBC driver (optional: only needed when persisting crawled data) -->
<dependency>
    <groupId>com.mysql</groupId>
    <artifactId>mysql-connector-j</artifactId>
    <version>8.0.32</version>
</dependency>

<!-- MyBatis (optional: only needed when persisting crawled data) -->
<dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis</artifactId>
    <version>3.5.10</version>
</dependency>
数据库配置信息
mybatis-config.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE configuration
PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
<!-- Single environment named "development", selected as the default. -->
<environments default="development">
<environment id="development">
<!-- Transactions are managed through the JDBC connection. -->
<transactionManager type="JDBC"/>
<!-- Pooled data source pointing at the local "test" schema. -->
<!-- NOTE(review): credentials are hard-coded here for the tutorial; externalize them for real use. -->
<dataSource type="POOLED">
<property name="driver" value="com.mysql.cj.jdbc.Driver"/>
<property name="url" value="jdbc:mysql://localhost:3306/test"/>
<property name="username" value="root"/>
<property name="password" value="root"/>
</dataSource>
</environment>
</environments>
<!-- Register the mapper interfaces found in the com.test.mapper package. -->
<mappers>
<package name="com.test.mapper"/>
</mappers>
</configuration>
Mapper 类:
package com.test.mapper;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
/**
 * MyBatis mapper for writing crawled dish records into the {@code dish_gyc} table.
 */
public interface DishMapper {

    /**
     * Inserts a single row into {@code dish_gyc}.
     *
     * <p>The {@code id} column is always supplied as SQL {@code null}; presumably the
     * table generates it (e.g. auto-increment) - confirm against the schema.
     *
     * @param name     value bound to the {@code name} column
     * @param calorie  value bound to the {@code calorie} column
     * @param category value bound to the {@code category} column
     * @param major    value bound to the {@code major} column
     */
    @Insert("insert into dish_gyc(id, name, calorie, category, major) values(null, #{name}, #{calorie}, #{category}, #{major})")
    void insertDishes(
            @Param("name") String name,
            @Param("calorie") String calorie,
            @Param("category") String category,
            @Param("major") String major);
}
Mybatis 工具类:
package com.test.util;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
/**
 * Holds the single {@link SqlSessionFactory} built from the MyBatis configuration on the
 * classpath and hands out sessions from it.
 */
public class SqlSessionUtil {

    /** Classpath location of the MyBatis configuration file. */
    private static final String CONFIG_RESOURCE = "mybatis-config.xml";

    /** Built once when the class is initialized; never null afterwards. */
    private static final SqlSessionFactory sqlSessionFactory = buildFactory();

    /** Kept public for backward compatibility; all functionality is static. */
    public SqlSessionUtil() {}

    /**
     * Loads the configuration resource and builds the factory.
     *
     * <p>Failing here (instead of logging and continuing) surfaces the real cause at class
     * initialization rather than as a later NullPointerException in {@link #openSession()}.
     *
     * @return the configured factory
     * @throws IllegalStateException if the configuration cannot be read or parsed
     */
    private static SqlSessionFactory buildFactory() {
        try {
            return new SqlSessionFactoryBuilder().build(Resources.getResourceAsStream(CONFIG_RESOURCE));
        } catch (Exception e) {
            throw new IllegalStateException("Failed to build SqlSessionFactory from " + CONFIG_RESOURCE, e);
        }
    }

    /**
     * Opens a new session using the factory's no-argument {@code openSession()}.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing and closing it
     */
    public static SqlSession openSession() {
        return sqlSessionFactory.openSession();
    }
}
具体实现
package com.test;
import com.test.mapper.DishMapper;
import com.test.util.SqlSessionUtil;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.List;
/**
 * Crawls the paginated table at nutridata.cn with Selenium + Jsoup and stores every row
 * through {@link DishMapper}.
 */
public class Main {

    /** Number of full pages to crawl before the final, partially filled page. */
    private static final int FULL_PAGE_COUNT = 2213;

    /** Rows expected on each full page. */
    private static final int ROWS_PER_FULL_PAGE = 10;

    /** Rows expected on the final page. */
    private static final int ROWS_ON_LAST_PAGE = 4;

    /**
     * Opens the listing page, walks through all pages inserting each row, then commits once.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Location of the chromedriver executable.
        System.setProperty("webdriver.chrome.driver", "/chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("https://www.nutridata.cn/database/list?id=2&date=1698935671620");
            Thread.sleep(3000); // give the page's JavaScript time to render the table

            try (SqlSession sqlSession = SqlSessionUtil.openSession()) {
                DishMapper mapper = sqlSession.getMapper(DishMapper.class);

                // Full pages: store the rows, then advance to the next page.
                for (int page = 0; page < FULL_PAGE_COUNT; page++) {
                    saveCurrentPage(driver, mapper, ROWS_PER_FULL_PAGE);
                    clickNextPage(driver);
                    Thread.sleep(2500); // wait for the next page to render
                }

                // Final page holds fewer rows and has no following page.
                saveCurrentPage(driver, mapper, ROWS_ON_LAST_PAGE);

                // Single explicit commit once every page has been stored.
                sqlSession.commit();
            }
        } finally {
            // Always shut the browser down, even when crawling or persistence fails.
            driver.quit();
        }
    }

    /**
     * Parses the currently displayed page and inserts up to {@code maxRows} table rows.
     *
     * <p>Only the first {@code maxRows} elements with class {@code el-table__row} are read,
     * and never more than are actually present, so a short page does not raise an
     * IndexOutOfBoundsException.
     *
     * @param driver  browser showing the page to read
     * @param mapper  mapper used to insert each row
     * @param maxRows upper bound on the number of rows to store from this page
     */
    private static void saveCurrentPage(WebDriver driver, DishMapper mapper, int maxRows) {
        Document document = Jsoup.parse(driver.getPageSource());
        Elements rows = document.getElementsByClass("el-table__row");
        int rowCount = Math.min(maxRows, rows.size());
        for (int r = 0; r < rowCount; r++) {
            Elements cells = rows.get(r).getElementsByClass("cell");
            mapper.insertDishes(
                    cells.get(0).text(),
                    cells.get(1).text(),
                    cells.get(2).text(),
                    cells.get(3).text());
        }
    }

    /**
     * Clicks the "next page" button through JavaScript.
     *
     * <p>Per the original author's note, two elements share the {@code btn-next} class and the
     * second one is the target; a native {@code click()} may fail (e.g. when a popup is in the
     * way), so the click is dispatched via script instead.
     *
     * @param driver browser showing the paginated table
     * @throws IllegalStateException if fewer than two {@code btn-next} elements are found
     */
    private static void clickNextPage(WebDriver driver) {
        List<WebElement> nextButtons = driver.findElements(By.className("btn-next"));
        if (nextButtons.size() < 2) {
            throw new IllegalStateException(
                    "Expected two 'btn-next' elements but found " + nextButtons.size());
        }
        WebElement nextButton = nextButtons.get(1);
        JavascriptExecutor executor = (JavascriptExecutor) driver;
        executor.executeScript("arguments[0].click();", nextButton);
    }
}