爬取信息如下:
存储方式:mysql
爬取工具:selenium自动化
数据清洗:正则表达式
selenium获取数据:
获取数据:
# 需要获取数据的地址
url ='https://news.sina.cn/zt_d/yiqing0121'
# webdriver.Chrome需安装chromedriver.exe才能使用,安装方法自行baidu
driver = webdriver.Chrome()
# 用get方法打开链接
driver.get(url)
# 获取渲染后的源代码
page = driver.page_source
正则表达式清洗数据:
'''
对获取到的源码进行分析,寻找需要获取数据的特征,使用正则筛选信息
以治愈人数rebirth为例,数据所在位置为<span class="c3">79</span>
使用正则筛选出所有的与该结构相似的文本,\d*表示0个或多个数字
'''
findGX = re.findall(r'百色.*广东', page,re.M)
exisiting_list = re.findall(r'<span class="c7">\s+\d+\s+</span>',findGX[0])
total_list = data_list = re.findall(r'<span class="c2 hasEm">\d*\s*<em>',findGX[0])
rebirth_list = re.findall(r'<span class="c3">\d*</span>',findGX[0])
death_list = re.findall(r'<span class="c4">\d*</span>',findGX[0])
# 遍历数据,使用正则进一步清洗数据,将数据存入列表
for el in exisiting_list:
exisiting_COVID.append(re.search(r'>\s+(\d+)\s+<', el).group(1))
for tl in total_list:
total_COVID.append(re.search(r'>(\d+)\s+<', tl).group(1))
for rl in rebirth_list:
rebirth.append(re.search(r'>(\d*)<', rl).group(1))
for dl in death_list:
death.append(re.search(r'>(\d*)<', dl).group(1))
数据库使用:
连接数据库:
# host:服务器地址,port:服务器端口,user:用户名,password:密码,database:数据库名
conn = pymysql.connect(host='localhost', user='root', password='cloud666', database='test', port=3306)
cursor = self.conn.cursor()
创建数据库表:
# 获取日期
table = str(datetime.datetime.now().month) +'m' + str(datetime.datetime.now().day) + 'd'
# 创建数据库(city 城市,existing_COVID 现存确诊,total_COVID 累计确诊,death死亡人数,rebirth治愈人数),将表名设置为变量,以每天日期为表名
sql1 = '''
CREATE TABLE %s(
city varchar(10) not null ,
existing_COVID varchar(10) ,
total_COVID varchar(10) ,
death varchar(10) ,
rebirth varchar(10));
''' % table
# 执行sql语句,创建数据库表
cursor.execute(sql1)
插入数据:
# 插入数据语句,用format传入需要插入数据的表名
sql2 = '''
insert into {}(city,existing_COVID,total_COVID,death,rebirth) values(%s,%s,%s,%s,%s)
'''.format(table)
# 遍历列表数据,执行sql2插入语句,将数据存入数据库
for i in range(len(rebirth)):
self.cursor.execute(sql2,(city[i],exisiting_COVID[i],total_COVID[i],death[i],rebirth[i]))
self.conn.commit()
全部代码:
import datetime
import re
import pymysql
from selenium import webdriver
class yiqng():
def __init__(self):
# 连接数据库
self.conn = pymysql.connect(host='localhost', user='root', password='cloud666', database='test', port=3306)
self.cursor = self.conn.cursor()
self.getdata()
def getdata(self):
url ='https://news.sina.cn/zt_d/yiqing0121'
driver = webdriver.Chrome()
driver.get(url)
page = driver.page_source
# 正则筛选信息
findGX = re.findall(r'百色.*广东', page,re.M)
exisiting_list = re.findall(r'<span class="c7">\s+\d+\s+</span>',findGX[0])
total_list = data_list = re.findall(r'<span class="c2 hasEm">\d*\s*<em>',findGX[0])
rebirth_list = re.findall(r'<span class="c3">\d*</span>',findGX[0])
death_list = re.findall(r'<span class="c4">\d*</span>',findGX[0])
city = ['百色','境外输入','南宁','崇左','柳州','桂林','梧州','北海','防城港','钦州','贵港','玉林','河池','贺州','来宾']
# 现存确诊
exisiting_COVID = []
# 累计确诊
total_COVID = []
# 治愈
rebirth = []
# 死亡
death = []
for el in exisiting_list:
exisiting_COVID.append(re.search(r'>\s+(\d+)\s+<', el).group(1))
for tl in total_list:
total_COVID.append(re.search(r'>(\d+)\s+<', tl).group(1))
for rl in rebirth_list:
rebirth.append(re.search(r'>(\d*)<', rl).group(1))
for dl in death_list:
death.append(re.search(r'>(\d*)<', dl).group(1))
table = str(datetime.datetime.now().month) +'m' + str(datetime.datetime.now().day) + 'd'
sql1 = '''
CREATE TABLE %s(
city varchar(10) not null ,
existing_COVID varchar(10) ,
total_COVID varchar(10) ,
death varchar(10) ,
rebirth varchar(10));
''' % table
self.cursor.execute(sql1)
sql2 = '''
insert into {}(city,existing_COVID,total_COVID,death,rebirth) values(%s,%s,%s,%s,%s)
'''.format(table)
for i in range(len(rebirth)):
self.cursor.execute(sql2,(city[i],exisiting_COVID[i],total_COVID[i],death[i],rebirth[i]))
self.conn.commit()
if __name__ == '__main__':
yiqng()