使用selenium定时爬取网页内容-java版本

使用场景

某些网页有反爬机制,仅使用 jsoup 或 httpclient 无法获取到页面内容,此时可以使用 selenium 驱动真实浏览器来抓取。

环境配置

https://registry.npmmirror.com/binary.html?path=chromedriver/

从上面的镜像地址下载与本机 Chrome 浏览器版本一致的 chromedriver 并解压即可;解压后 chromedriver 可执行文件的路径会在代码中通过 webdriver.chrome.driver 系统属性引用。
 

pom引入

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build for the fund-data scraper: a Spring Boot application that uses
         Selenium, Hutool and the MySQL JDBC driver. -->
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.2</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.pshdhx.fund</groupId>
    <artifactId>tiantianjijin</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>tiantianjijin</name>
    <description>天天基金数据爬取</description>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <!-- Spring Boot web starter; no version given, so it comes from the parent POM. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Lombok; the version is pinned explicitly here. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.4</version>
        </dependency>
        <!-- Spring Boot test starter, available on the test classpath only. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- jsoup HTML parser. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>


        <!-- Hutool utility bundle; supplies the cn.hutool.db classes used for database access. -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.8.4</version>
        </dependency>

        <!-- Selenium WebDriver Java bindings, used to drive Chrome. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>4.0.0</version>
        </dependency>

        <!-- MySQL JDBC driver for the jdbc:mysql URL configured in db.setting. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.29</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot Maven plugin; Lombok is listed under excludes so it is
                 left out of the packaged application. -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>

数据库设置

将 db.setting 文件放到项目的 src/main/resources 目录下(即类路径根目录),Hutool 的 Db.use() 会从类路径读取该配置来创建数据源。

## db.setting file

url = jdbc:mysql://xxxxxx:3306/tiantian_fund?useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai
user = root
pass = xxxxxxx

## Optional settings
# Whether to show executed SQL in the log
showSql = true
# Whether to format the displayed SQL
formatSql = false
# Whether to show SQL parameters
showParams = true
# Log level used when printing SQL; defaults to debug, may also be info, warn or error
sqlLevel = debug

定时任务代码引入

package com.pshdhx.fund;

import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverLogLevel;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;

import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

/**
 * Scheduled job that loads the fund ranking page of fund.eastmoney.com in a headless
 * Chrome driven by Selenium, reads the text of the {@code dbtable} table body and
 * inserts one row per fund into the {@code fund_info} table through Hutool's {@code Db}.
 *
 * @author pshdhx
 * @date 2022-08-01 13:23
 */
@Configuration      // 1. Marks a configuration class; also has the effect of @Component.
@EnableScheduling   // 2. Enables scheduled tasks.
public class ScheduleTask {

    /** Text lines the table body yields per fund: index, fund code, fund name, value line. */
    private static final int LINES_PER_FUND = 4;

    /**
     * Scrapes the ranking table and stores every complete fund entry.
     *
     * <p>The browser is always shut down once the table text has been read, even when
     * loading or locating the table fails, so no Chrome/chromedriver process is left behind.
     *
     * @throws ParseException if the date token of a value line is not a valid {@code MM-dd} date
     */
    // 3. Register the scheduled task.
    // NOTE(review): Spring numbers day-of-week 0-7 with 0 and 7 both meaning Sunday, so
    // "2-6" fires Tuesday-Saturday (Quartz would read it as Monday-Friday) - confirm intent.
    @Scheduled(cron = "0 0 10 ? * 2-6")
    // Alternatively specify a fixed interval directly, e.g. 5 seconds:
    //@Scheduled(fixedRate=5000)
    private void configureTasks() throws ParseException {
        // Point Selenium at the locally unpacked chromedriver binary.
        System.setProperty("webdriver.chrome.driver", "D:\\new\\chromeDownload\\chromedriver_win32\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless");
        options.addArguments("--disable-gpu");
        options.setLogLevel(ChromeDriverLogLevel.OFF);

        // Create the driver and read the table text; quit the browser in all cases.
        WebDriver driver = new ChromeDriver(options);
        String[] lines;
        try {
            driver.get("https://fund.eastmoney.com/data/fundranking.html#tall;c0;r;szzf;pn200;ddesc;");
            WebElement table = driver.findElement(By.id("dbtable"));
            System.err.println("==========================================================");
            WebElement tbody = table.findElement(By.tagName("tbody"));
            lines = tbody.getText().split("[\n]");
        } finally {
            driver.quit();
        }

        // Walk the text in groups of four lines; a trailing incomplete group is ignored.
        for (int start = 0; start + LINES_PER_FUND <= lines.length; start += LINES_PER_FUND) {
            String fundCode = lines[start + 1];
            String fundName = lines[start + 2];
            String fundValues = lines[start + 3];
            if (!fundValues.isEmpty()) {
                saveFund(fundCode, fundName, fundValues);
            }
        }
    }

    /**
     * Parses one value line and inserts the fund into {@code fund_info}.
     *
     * <p>All parsed fields start out as {@code null} for every fund, so a missing value is
     * stored as {@code null} instead of silently reusing the previous fund's figure.
     * A failed insert is reported on stderr and does not stop the remaining funds.
     *
     * @param fundCode   fund code taken from the table
     * @param fundName   fund name taken from the table
     * @param fundValues space-separated value line; token 0 is the date, tokens 3 to 6 are
     *                   the day, week, month and three-month rates
     * @throws ParseException if the date token cannot be parsed
     */
    private static void saveFund(String fundCode, String fundName, String fundValues) throws ParseException {
        String[] values = fundValues.split("[ ]");
        Date fundDate = null;
        Double dayRate = null;
        Double weekRate = null;
        Double monthRate = null;
        Double month3Rate = null;

        if (values.length > 5) {
            fundDate = parseFundDate(values[0]);
            dayRate = parseRate(values[3]);
            weekRate = parseRate(values[4]);
            monthRate = parseRate(values[5]);
            // The three-month column is only read when the token actually exists.
            if (values.length > 6) {
                month3Rate = parseRate(values[6]);
            }
        }

        try {
            Db.use().insert(Entity.create("fund_info")
                    .set("fund_code", fundCode)
                    .set("fund_name", fundName)
                    .set("fund_date", fundDate)
                    .set("fund_day_rate", dayRate)
                    .set("fund_week_rate", weekRate)
                    .set("fund_month_rate", monthRate)
                    .set("fund_month_3_rate", month3Rate));
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a percentage token such as {@code "1.23%"} into its numeric part.
     *
     * @param token a single token of the value line
     * @return the number in front of the first {@code %}, or {@code null} when the token
     *         has no {@code %} or nothing in front of it
     */
    private static Double parseRate(String token) {
        int percentIndex = token.indexOf('%');
        if (percentIndex <= 0) {
            return null;
        }
        return Double.parseDouble(token.substring(0, percentIndex));
    }

    /**
     * Builds a full date from the {@code MM-dd} token shown on the page.
     *
     * <p>The page carries no year, so the current year is assumed; if that would put the
     * date in the future (a December value read in January) the previous year is used.
     *
     * @param monthDay date token in {@code MM-dd} form
     * @return the resolved date
     * @throws ParseException if the token is not a valid {@code MM-dd} date
     */
    private static Date parseFundDate(String monthDay) throws ParseException {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        int year = LocalDateTime.now().getYear();
        Date parsed = format.parse(year + "-" + monthDay);
        if (parsed.after(new Date())) {
            parsed = format.parse((year - 1) + "-" + monthDay);
        }
        return parsed;
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值