使用 Selenium 爬取官方公布的疫情信息,存入 MySQL 并通过 Superset 展示
工程文件下载: 链接:https://pan.baidu.com/s/1dGx5RMVs3akD2AlpC1aXWQ?pwd=avum
提取码:avum
1.配置类
package source
import org.openqa.selenium.By
/** Static scraping configuration: the index URL, the element locators and the
  * regex rule sets for the national health commission site.
  */
object provinceSource {

  /** Bundle of regex rule lists, one list per statistic.
    *
    * Every rule is a `(pattern, Int)` pair. The `Int` is presumably the index of
    * the capture group holding the number — TODO confirm, nothing in the visible
    * code reads it.
    */
  trait Regx(
      var In_confirm: List[(String, Int)],
      var Out_confirm: List[(String, Int)],
      var In_suspectedRegxList: List[(String, Int)],
      var Out_suspectedRegxList: List[(String, Int)],
      var now_confirm: List[(String, Int)],
      var now_suspected: List[(String, Int)],
      var all_confirm: List[(String, Int)],
      var all_suspected: List[(String, Int)],
      var today_heal: List[(String, Int)],
      var today_relieve: List[(String, Int)],
      var all_heal: List[(String, Int)],
      var all_relieve: List[(String, Int)]
  )

  /** Pair of region identifiers attached to a data source. */
  trait Code(var area_code: String, var iso_code: String)

  /** Index page that lists the published bulletins. */
  val guoJia = "http://www.nhc.gov.cn/xcs/xxgzbd/gzbd_index.shtml"

  /** Locator of the `<ul>` holding the bulletin links on the index page. */
  val guoJiaXpath = By.xpath("/html/body/div[3]/div[2]/ul")

  /** XPath of the container element on a bulletin page. */
  val guoJiaXpath2 = "/html/body/div[3]/div[2]/div[3]"

  /** CSS selector applied inside the container to pick the text elements. */
  val guoJiaCssSelecter = "div p"

  // NOTE(review): these values look like a province-level code rather than a
  // national one — confirm they are intended for this source.
  class guoJiaCode extends Code("320000", "CN-32")

  /** Rule set for the national bulletin; arguments follow the parameter order of [[Regx]]. */
  class guoJiaRegx extends Regx(
    // In_confirm
    List("新增本土确诊病例(\\d+)例" -> 1),
    // Out_confirm
    List("新增境外.*确诊病例(\\d+)例" -> 1),
    // In_suspectedRegxList
    List("新增本土无症状感染者(\\d+)例" -> 1),
    // Out_suspectedRegxList
    List("新增境外输入无症状感染者(\\d+)例" -> 1),
    // now_confirm
    List("隔离治疗.*确诊病例(\\d+)例" -> 1),
    // now_suspected
    List("隔离医学管理的无症状感染者(\\d+)例" -> 1),
    // all_confirm, all_suspected: no rules defined
    Nil,
    Nil,
    // today_heal
    List("新增出院病例(\\d+)例" -> 1),
    // today_relieve
    List("解除隔离医学管理的无症状感染者(\\d+)例" -> 1),
    // all_heal, all_relieve: no rules defined
    Nil,
    Nil
  )
}
2.主类
import org.openqa.selenium.{By, WebDriver, WebElement}
import org.openqa.selenium.chrome.ChromeDriver
import source.provinceSource.*
import java.sql.Statement
import java.text.SimpleDateFormat
import java.util
import java.util.logging.Logger
import java.util.regex.Matcher
import scala.collection.mutable
import collection.JavaConverters.*
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
/** Program entry point: scrapes the latest bulletin with Selenium and hands the
  * extracted text lines to [[RegexMatches.matchLineGuojia]].
  */
object pulgs {

  /** Configures the chromedriver location and runs the scrape on a worker thread.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // Tell Selenium where the chromedriver binary lives.
    System.setProperty("webdriver.chrome.driver", "C:\\Users\\y\\Desktop\\superSet\\chromedriver\\chromedriver.exe")
    val fixedThreadPool = Executors.newFixedThreadPool(5)
    try {
      fixedThreadPool.execute(() => {
        provinceCreeper("国家卫健委", guoJia, guoJiaXpath, guoJiaXpath2, guoJiaCssSelecter, new guoJiaCode(), new guoJiaRegx())
      })
    } finally {
      // Already-submitted tasks still run to completion; without this call the
      // pool's non-daemon worker thread keeps the JVM alive forever.
      fixedThreadPool.shutdown()
    }
  }

  /** Opens the index page, follows the selected bulletin link, collects the text
    * of every matching element and passes the lines to the regex processor.
    *
    * @param provinceName label used in log output; "安徽" selects the second list
    *                     entry instead of the first
    * @param indexUrl     URL of the page that lists the bulletins
    * @param xpath1       locator of the element containing the `<li>` bulletin links
    * @param xpath2       XPath of the container element on the bulletin page
    * @param cssSelecter  CSS selector applied inside the container
    * @param code         currently unused
    * @param regx         currently unused
    * @throws NoSuchElementException if the list has no entry at the selected index
    */
  def provinceCreeper(provinceName: String, indexUrl: String, xpath1: By, xpath2: String, cssSelecter: String, code: Code, regx: Regx): Unit = {
    // chromedriver downloads: http://chromedriver.storage.googleapis.com/index.html
    val driver = new ChromeDriver
    //val driver = new FirefoxDriver()
    try {
      driver.get(indexUrl)
      println("\n")
      println(s"----------- ${provinceName}")
      println(s"[$provinceName] index " + indexUrl)

      // Use the locator supplied by the caller rather than a hard-coded copy of it.
      val linkList = driver.findElement(xpath1)
      val items = linkList.findElements(By.tagName("li")).asScala
      val itemIndex = if (provinceName.equals("安徽")) 1 else 0
      val item = items.lift(itemIndex).getOrElse(
        throw new NoSuchElementException(s"[$provinceName] no <li> at index $itemIndex on $indexUrl")
      )
      val currUrl: String = item.findElement(By.tagName("a")).getAttribute("href")
      println(s"[$provinceName] second-index " + currUrl)

      driver.get(currUrl)
      val container = driver.findElement(By.xpath(xpath2))
      val paragraphs = container.findElements(By.cssSelector(cssSelecter)).asScala
      val lineArr: mutable.Buffer[String] = paragraphs.map { p =>
        // Read the text once; every getText call is a round-trip to the browser.
        val text = p.getText
        print(s"[$provinceName] " + text)
        text
      }
      RegexMatches.matchLineGuojia(lineArr)
      println("-----------End\n")
    } finally {
      // Always terminate the browser and the chromedriver process, even on failure.
      driver.quit()
    }
  }
}
3.正则处理类
import source.provinceSource.{Code, Regx}
import java.text.SimpleDateFormat
import java.util.regex.{Matcher, Pattern}
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.{BufferedSource, Source}
/** Extracts per-province figures from bulletin text lines and stores them via `mysqlDb`. */
object RegexMatches {
  import scala.util.Using

  // Line classifiers, compiled once. `matcher(line).matches()` is what
  // `String.matches` does internally, minus the recompilation on every call.
  private val ConfirmedLine = Pattern.compile("^\\d+月\\d+日(.*?)31个省.*")
  private val AsymptomaticLine = Pattern.compile(".*新增无症状感染者.*")
  private val HealedLine = Pattern.compile(".*新增治愈.*")
  private val ReleasedLine = Pattern.compile(".*解除医学观察的无症状感染者.*")

  /** Loads `lib/map.txt` into a map.
    *
    * Each line is split on a single space; the first token becomes the key and
    * the second the value. Lines with fewer than two tokens are skipped.
    *
    * @return mutable map built from the file contents
    */
  def getMap(): mutable.HashMap[String, String] = {
    val buffer = new mutable.HashMap[String, String]()
    // Using.resource closes the file even if reading throws.
    Using.resource(Source.fromFile("lib/map.txt")) { source =>
      for (line <- source.getLines()) {
        val strings = line.split(" ")
        if (strings.length >= 2) buffer += strings(0) -> strings(1)
      }
    }
    buffer
  }

  /** For every entry of [[getMap]], scans `lineArr` for that province's figures,
    * builds a `sqlbean`, writes it with `mysqlDb.execute` and prints the outcome.
    *
    * When several lines of one kind mention a province, the last one wins.
    *
    * @param lineArr text lines scraped from the bulletin page
    * @return always `null`; the beans are persisted and printed, not returned
    */
  def matchLineGuojia(lineArr: mutable.Buffer[String]): sqlbean = {
    val getCurDate = new SimpleDateFormat("yyyy-MM-dd").format(System.currentTimeMillis())

    getMap().foreach { case (provinceName, area_code) =>
      // One pattern per province, shared by all four statistics. The name is
      // quoted so that regex metacharacters in the map file are taken literally.
      val countPattern = Pattern.compile(Pattern.quote(provinceName) + "(\\d+)例")

      def countIn(line: String): Option[String] = {
        val matcher = countPattern.matcher(line)
        if (matcher.find()) Some(matcher.group(1)) else None
      }

      // Count from the last line of the given kind that mentions this province.
      def lastCount(kind: Pattern): Option[String] =
        lineArr.filter(line => kind.matcher(line).matches()).flatMap(countIn).lastOption

      val iso_code = s"CN-${area_code}"

      // New confirmed cases reported for the day, per province.
      val in_confirm = lastCount(ConfirmedLine).fold(0)(_.toInt)
      // New asymptomatic cases reported for the day, per province.
      val in_suspected = lastCount(AsymptomaticLine).fold(0)(_.toInt)
      // Newly recovered for the day.
      val today_heal = lastCount(HealedLine).getOrElse("未知")
      // Released from observation for the day.
      val today_relieve = lastCount(ReleasedLine).getOrElse("未知")

      // No extraction is implemented for the remaining fields; they keep their defaults.
      val out_confirm = 0
      val out_suspected = 0
      val today_infestor = 0
      val now_confirm = "未知"
      val now_suspected = "未知"
      val all_confirm = "未知"
      val all_suspected = "未知"
      val all_heal = "未知"
      val all_relieve = "未知"

      val bean: sqlbean = sqlbean(ds = getCurDate, province_name = provinceName, area_code = area_code, iso_code = iso_code,
        in_confirm = in_confirm, in_suspected = in_suspected, out_confirm = out_confirm, out_suspected = out_suspected,
        today_infestor = today_infestor,
        now_confirm = now_confirm, now_suspected = now_suspected,
        all_confirm = all_confirm, all_suspected = all_suspected,
        today_heal = today_heal, today_relieve = today_relieve,
        all_heal = all_heal, all_relieve = all_relieve)
      println(mysqlDb.execute(bean))
      println(s"--${provinceName}--" + bean)
    }

    // NOTE(review): kept for interface compatibility — the only visible caller
    // ignores the result. Returning Unit or the collected beans would be cleaner.
    null
  }
}
4.bean类
/** One row of case statistics for a province on a given date.
  *
  * The first four fields identify the row; the `Int` fields are counts and the
  * `String` fields carry either a number or a placeholder text.
  */
case class sqlbean(
    var ds: String,
    var province_name: String,
    var area_code: String,
    var iso_code: String,
    var in_confirm: Int,
    var in_suspected: Int,
    var out_confirm: Int,
    var out_suspected: Int,
    var today_infestor: Int,
    var now_confirm: String,
    var now_suspected: String,
    var all_confirm: String,
    var all_suspected: String,
    var today_heal: String,
    var today_relieve: String,
    var all_heal: String,
    var all_relieve: String
) {

  /** Renders the statistic fields as `label:value ` pairs, each followed by one
    * space; the four identifying fields are not included.
    */
  override def toString: String = {
    val labelled: Seq[(String, String)] = Seq(
      "境内确诊" -> in_confirm.toString,
      "境内无症状" -> in_suspected.toString,
      "境外确诊" -> out_confirm.toString,
      "境外无症状" -> out_suspected.toString,
      "当日感染者" -> today_infestor.toString,
      "现存确诊" -> now_confirm,
      "现存无症状" -> now_suspected,
      "累计确诊" -> all_confirm,
      "累计无症状" -> all_suspected,
      "当日新增治愈" -> today_heal,
      "当日解除观察" -> today_relieve,
      "累计治愈出院" -> all_heal,
      "累计解除观察" -> all_relieve
    )
    labelled.map { case (label, value) => s"${label}:${value} " }.mkString
  }
}
4.1.sql工具类
import java.sql.Statement
/** Minimal JDBC helper that upserts one [[sqlbean]] into `db1.cov19Case`. */
object mysqlDb {
  import java.sql.Connection
  import java.sql.DriverManager
  import java.sql.PreparedStatement
  import scala.util.Using

  /** Inserts the bean as a row, or updates the statistic columns of the existing
    * row when the insert hits a duplicate key.
    *
    * All values are bound as statement parameters, never spliced into the SQL text.
    *
    * @param sqlbean the row to store
    * @return a log line holding the statement text, the inserted values and a
    *         success/failure marker derived from the affected-row count
    * @throws java.sql.SQLException if connecting or executing fails
    * @throws ClassNotFoundException if the JDBC driver class is not on the classpath
    */
  def execute(sqlbean: sqlbean): String = {
    // Connection settings.
    // NOTE(review): credentials are hard-coded; consider loading them from configuration.
    val driver: String = "com.mysql.jdbc.Driver"
    val url: String = "jdbc:mysql://localhost:3306/db1?characterEncoding=UTF-8"
    val user: String = "root"
    val password: String = "xxx"

    // Identifying values; inserted but never overwritten on a duplicate key.
    val keyValues: Seq[String] =
      Seq(sqlbean.ds, sqlbean.province_name, sqlbean.area_code, sqlbean.iso_code)
    // Statistic columns in insert order. The same lists drive the UPDATE clause so
    // no column can be forgotten or listed twice there.
    val intColumns: Seq[(String, Int)] = Seq(
      "in_confirm" -> sqlbean.in_confirm,
      "in_suspected" -> sqlbean.in_suspected,
      "out_confirm" -> sqlbean.out_confirm,
      "out_suspected" -> sqlbean.out_suspected,
      "today_infestor" -> sqlbean.today_infestor
    )
    val textColumns: Seq[(String, String)] = Seq(
      "now_confirm" -> sqlbean.now_confirm,
      "now_suspected" -> sqlbean.now_suspected,
      "all_confirm" -> sqlbean.all_confirm,
      "all_suspected" -> sqlbean.all_suspected,
      "today_heal" -> sqlbean.today_heal,
      "today_relieve" -> sqlbean.today_relieve,
      "all_heal" -> sqlbean.all_heal,
      "all_relieve" -> sqlbean.all_relieve
    )

    val columnCount = keyValues.size + intColumns.size + textColumns.size
    val placeholders = Seq.fill(columnCount)("?").mkString(",")
    val assignments =
      (intColumns.map(_._1) ++ textColumns.map(_._1)).map(column => s"$column = ?").mkString(", ")
    val sql: String =
      s"INSERT INTO db1.cov19Case VALUES($placeholders) ON DUPLICATE KEY UPDATE $assignments"

    // Load the JDBC driver.
    Class.forName(driver)

    // Using.resource closes the statement and the connection even when execution throws.
    val count: Int = Using.resource(DriverManager.getConnection(url, user, password)) { (conn: Connection) =>
      Using.resource(conn.prepareStatement(sql)) { (statement: PreparedStatement) =>
        // JDBC parameter positions are 1-based and run across the whole statement.
        var position = 0
        def next(): Int = { position += 1; position }
        def bindStatistics(): Unit = {
          intColumns.foreach { case (_, value) => statement.setInt(next(), value) }
          textColumns.foreach { case (_, value) => statement.setString(next(), value) }
        }
        keyValues.foreach(value => statement.setString(next(), value))
        bindStatistics() // VALUES(...) part
        bindStatistics() // ON DUPLICATE KEY UPDATE part
        statement.executeUpdate()
      }
    }

    // NOTE(review): per the MySQL manual an upsert reports 0 affected rows when the
    // existing row already holds these values, so a 0 here need not be a failure.
    val msg: String = if (count > 0) "(添加成功)" else "(添加失败)"
    val insertedValues = (keyValues ++ intColumns.map(_._2) ++ textColumns.map(_._2)).mkString("[", ", ", "]")
    s"[${sqlbean.province_name}-Sql] ${sql} ${insertedValues}; ${msg}!"
  }
}
superset报表只用了国家地图和直方图