爬取每日疫情通报
知识点
https://blog.csdn.net/qq_35531549/article/details/87983243 定义日期,数字或字符转日期
https://blog.csdn.net/HoWaterSuper/article/details/82891278 日期比较
源代码
import datetime
import re
import time
from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent
from lxml import etree
class Yqtb(object):
    """Scraper for the latest daily epidemic bulletin listed at ``self.url``.

    Workflow (see :meth:`run`): download the index page, pick the bulletin
    link from it, download that bulletin's text, and report whether the
    bulletin is dated yesterday.
    """

    # Seconds to wait on a request before giving up, so a stalled server
    # cannot hang the script forever.
    _REQUEST_TIMEOUT = 10

    # Matches "M月D日" with an optional leading "YYYY年" anywhere in a title.
    _DATE_PATTERN = re.compile(
        r'(?:(\d{4})\s*年\s*)?(\d{1,2})\s*月\s*(\d{1,2})\s*日'
    )

    def __init__(self):
        """Set up the random User-Agent source and the index page URL."""
        self.ua = UserAgent()
        self.url = r'http://wjw.gz.gov.cn/ztzl/xxfyyqfk/yqtb/index.html'
        # Not read by any method in this class; kept for compatibility.
        self.content = ''

    def parse_url(self, url):
        """Download *url* and return the response body as text.

        A random User-Agent header is sent with every request.

        Raises:
            requests.RequestException: on timeout, connection failure, or
                an HTTP error status (4xx/5xx).
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        headers = {"User-Agent": self.ua.random}
        response = requests.get(
            url, headers=headers, timeout=self._REQUEST_TIMEOUT
        )
        # Fail loudly on an error page instead of parsing it as a bulletin list.
        response.raise_for_status()
        return response.content.decode()

    @staticmethod
    def _parse_bulletin_date(text, today=None):
        """Extract the bulletin date from link text such as ``"3月5日..."``.

        Args:
            text: link text containing "M月D日", optionally prefixed by
                "YYYY年".
            today: reference ``datetime.date`` used to infer a missing year;
                defaults to the current local date.

        Returns:
            A ``datetime.datetime`` at midnight of the parsed day. When the
            text carries no year, the most recent year in which that
            month/day is not later than *today* is used. This is a
            best-effort guess: the text alone cannot identify the year.

        Raises:
            ValueError: if no date is found or the month/day is not a valid
                calendar date in the chosen year.
        """
        found = Yqtb._DATE_PATTERN.search(text)
        if found is None:
            raise ValueError("no 'M月D日' date found in %r" % (text,))

        year_text, month_text, day_text = found.groups()
        month, day = int(month_text), int(day_text)
        if year_text is not None:
            return datetime.datetime(int(year_text), month, day)

        if today is None:
            today = datetime.date.today()
        year = today.year
        # A month/day later than today can only belong to the previous year
        # (e.g. a "12月31日" bulletin read on January 1st).
        if (month, day) > (today.month, today.day):
            year -= 1
        return datetime.datetime(year, month, day)

    def get_url_list(self):
        """Return ``(bulletin_url, bulletin_date)`` for the selected bulletin.

        The link is taken from the second ``<li>`` of the ``cont_list``
        block on the index page; its text supplies the date.

        Raises:
            IndexError: if the index page has no such link.
            ValueError: if the link text contains no parseable date.
        """
        html = etree.HTML(self.parse_url(self.url))
        link_xpath = "//div[@class='cont_list']/ul/li[2]//a"
        url_list = html.xpath(link_xpath + "/@href")
        date_list = html.xpath(link_xpath + "/text()")
        if not url_list or not date_list:
            raise IndexError(
                "no bulletin link found on index page %s" % self.url
            )

        date_time = self._parse_bulletin_date(date_list[0])
        # Absolute hrefs pass through unchanged; relative ones are resolved
        # against the index page so they can be fetched directly.
        return urljoin(self.url, url_list[0]), date_time

    def valid_date(self, date_time):
        """Return True when *date_time* falls on yesterday's local date.

        Accepts a ``datetime.datetime`` (its time of day is ignored) or a
        plain ``datetime.date``. Calendar dates are compared directly, so
        the result does not depend on a day being exactly 86400 seconds.
        """
        if isinstance(date_time, datetime.datetime):
            bulletin_day = date_time.date()
        else:
            bulletin_day = date_time
        yesterday = datetime.date.today() - datetime.timedelta(days=1)
        return bulletin_day == yesterday

    def get_content(self, url):
        """Download the bulletin at *url* and return its text.

        Every text node under the ``<p>`` elements of the ``zoom_box``
        block is joined with newlines.
        """
        html = etree.HTML(self.parse_url(url))
        content_list = html.xpath("//div[@class='zoom_box']/p//text()")
        return '\n'.join(content_list)

    def run(self):
        """Fetch the bulletin, print it, and report whether it is yesterday's."""
        content_url, date_time = self.get_url_list()
        content = self.get_content(content_url)
        print(date_time, content)
        if self.valid_date(date_time):
            print("时间相同")
        else:
            print("时间不同")
        # Planned workflow (not implemented yet):
        # 1. Get the list of bulletin URLs.
        # 2. Iterate over the list and fetch each bulletin.
        # 3. Save the data.
if __name__ == '__main__':
    # Script entry point: build the scraper and run one fetch-and-compare pass.
    Yqtb().run()