Python爬取综合信息(地震信息,天气,气象新闻,热点新闻,小时降雨量前30,气象预警)
前置条件
1.使用python3(python3.6及以上)
2.需要安装requests库 pip install requests
3.需要安装fake_useragent pip install fake_useragent
4.需要安装谷歌浏览器以及对应的版本驱动
https://blog.csdn.net/ifubing/article/details/104462714
5.安装selenium库 pip install selenium
6.使用的是mysql数据库 安装数据库连接用的pymysql库 pip install pymysql
7.安装分析网页用的lxml库 pip install lxml
8.安装定时任务库 pip install apscheduler
实现代码
代码比较简单 需要修改的地方有:
1.数据库连接
db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
2.热点新闻的对象,在凤凰网搜索后将链接放到代码里,以及关键词的修改
3.需要爬取的天气,在天气配置文件中修改
# coding=UTF-8
# author : Nanfu
# date : 2021/6/11 13:54
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import json
import time
import datetime
import pymysql
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
import logging
#1.天气类
class Weather:
    """Fetch regional weather XML feeds and store the parsed rows in MySQL."""

    # Default location of the JSON file that lists the feeds to crawl.
    DEFAULT_CONFIG = 'F://pythonProject//demon1//day06//province.json'
    # Seconds to wait for a feed; without a timeout a stalled server would
    # block the calling job indefinitely.
    REQUEST_TIMEOUT = 30

    def get_urls(self, filename=None):
        """Load the feed descriptors from a JSON config file.

        Args:
            filename: Path of the JSON file; defaults to ``DEFAULT_CONFIG``.

        Returns:
            Whatever the JSON document decodes to. ``weather_info`` expects
            each entry to be a dict with ``url``, ``type`` and ``pname`` keys.
        """
        if filename is None:
            filename = self.DEFAULT_CONFIG
        # The context manager closes the handle even if decoding fails.
        with open(filename, 'rb') as f_obj:
            return json.load(f_obj)

    def weather_info(self, url_obj):
        """Download one feed and parse it unless this hour is already stored.

        Args:
            url_obj: Dict with ``url`` (feed address), ``type`` (``"0"``
                selects the first attribute layout, anything else the second)
                and ``pname`` (root tag name, also the database alias).

        Returns:
            A list of weather dicts; empty when ``select_weather`` reports a
            non-zero count for this alias or nothing could be parsed.
        """
        url = url_obj['url']
        region_type = url_obj['type']
        pname = url_obj['pname']
        response = requests.get(url=url, headers={'User-Agent': UserAgent().random},
                                timeout=self.REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        html = response.text
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            db_result = db.select_weather(pname)
            if db_result[0][0] != 0:
                return []
        finally:
            # Release the connection even when the lookup raises.
            db.close_db()
        return self._parse_cities(html, pname, region_type)

    @staticmethod
    def _parse_cities(html, pname, region_type):
        """Turn the raw feed text into a de-duplicated list of weather dicts."""
        # The root tag carries either dn="day" or dn="nay"; strip whichever
        # one is present, exactly as the feed spells it.
        dn = 'day' if ('<' + pname + ' dn="day">') in html else 'nay'
        body = (html.replace('<' + pname + ' dn="' + dn + '">', '')
                .replace('</' + pname + '>', '')
                .replace('<city ', '')
                .replace(' an', ''))

        def attr(fields, index, name):
            # Each field looks like name="value"; keep only the value.
            return fields[index].replace(name + '=', '').replace('"', '')

        data = []
        for chunk in body.split('/>'):
            if 'stateDetailed' not in chunk:
                continue
            fields = chunk.split(' ')
            # NOTE(review): the positional indexes assume the attribute order
            # currently emitted by the feed - confirm if the feed changes.
            if region_type == "0":
                province = attr(fields, 0, 'quName').replace('\n', '').replace('\r', '')
                state1 = attr(fields, 3, 'state1')
                state2 = attr(fields, 4, 'state2')
                state_detailed = attr(fields, 5, 'stateDetailed')
                wind_state = attr(fields, 8, 'windState')
            else:
                province = attr(fields, 2, 'cityname')
                state1 = attr(fields, 6, 'state1')
                state2 = attr(fields, 7, 'state2')
                state_detailed = attr(fields, 8, 'stateDetailed')
                wind_state = attr(fields, 12, 'windState')
            entry = {"province": province, "pname": pname, "type": region_type,
                     "state1": state1, "state2": state2,
                     "stateDetailed": state_detailed, "windState": wind_state}
            if entry not in data:
                data.append(entry)
        return data

    def weather_insert_db(self, weather_datas):
        """Insert every parsed weather dict and commit once at the end."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            for weather_data in weather_datas:
                db.insert_weather(weather_data['province'], weather_data['pname'],
                                  weather_data['type'], weather_data['state1'],
                                  weather_data['state2'], weather_data['stateDetailed'],
                                  weather_data['windState'])
            db.commit()
        finally:
            db.close_db()
#2.气象新闻类
class WeatherReport:
    """Collect weather news items and store the ones matching given keywords."""

    # Seconds to wait for the news endpoint before giving up.
    REQUEST_TIMEOUT = 30

    def weather_report_info(self, url):
        """Download the news listing and decode its embedded JSON payload.

        Args:
            url: Address of the listing.

        Returns:
            The decoded JSON value; ``wr_insert`` iterates it as a sequence
            of dicts.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        return self._extract_entities(response.text)

    @staticmethod
    def _extract_entities(text):
        """Decode the JSON between the first ':' and the last '}' of *text*."""
        # The response wraps the item list inside an outer object, so only the
        # slice after the first colon and before the final brace is parsed.
        return json.loads(text[text.find(":") + 1:text.rfind("}")])

    def wr_insert(self, wr_entitys, keywords):
        """Insert every entity whose title contains at least one keyword.

        Args:
            wr_entitys: Iterable of dicts; ``c1`` is read as the title,
                ``c16`` as the content and ``c5`` + ``c6`` as the event time.
            keywords: Iterable of substrings to look for in the title.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        zq_type = 2
        try:
            for wr_entity in wr_entitys:
                zq_title = str(wr_entity['c1'])
                if not any(keyword in zq_title for keyword in keywords):
                    continue
                zq_content = str(wr_entity['c16'])
                eq_time = str(wr_entity['c5']) + " " + str(wr_entity['c6'])
                db.insert_wr(zq_time, zq_type, zq_title, zq_content, eq_time)
            db.commit()
        finally:
            # Release the connection even when an entity is malformed.
            db.close_db()
#3.地震类
class Earthquake:
    """Fetch recent earthquake records and store the new ones in MySQL."""

    # Seconds to wait for the earthquake endpoint before giving up.
    REQUEST_TIMEOUT = 30

    def eq_info(self, url):
        """Download the earthquake list and keep records from the last 7 days.

        Args:
            url: Address of the JSON endpoint.

        Returns:
            A list of dicts with keys ``address``, ``eq_time``, ``lttd``,
            ``lgtd``, ``magnitude``, ``eq_length``, ``zq_content`` and
            ``zq_type`` (always ``'3'``).
        """
        response = requests.get(
            url=url,
            headers={'User-Agent': UserAgent().random},
            params={"region": "1", "areaname": "", "dateType": "2", "magnitude": "0"},
            timeout=self.REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        data = []
        for record in json.loads(response.text):
            address = str(record['epicenter'])
            eq_time = str(record['orig_time'])
            lttd = str(record['latitudes'])
            lgtd = str(record['longitudes'])
            magnitude = str(record['num_mag'])
            eq_length = str(record['depth'])
            # Scanning stops at the first record older than a week, so later
            # records are never examined.
            # NOTE(review): presumably the endpoint lists newest first - confirm.
            if not self.is_week_ago(eq_time):
                break
            data.append({
                "address": address, "eq_time": eq_time, "lttd": lttd, "lgtd": lgtd,
                "magnitude": magnitude, "eq_length": eq_length,
                "zq_content": self._format_summary(magnitude, address, eq_time,
                                                   lttd, lgtd, eq_length),
                'zq_type': '3'})
        return data

    @staticmethod
    def _format_summary(magnitude, address, eq_time, lttd, lgtd, eq_length):
        """Build the one-line summary stored as ``zq_content``."""
        # The depth suffix must use the depth value; it previously repeated
        # the magnitude. Rows stored with the old text no longer match the
        # duplicate check and may be inserted once more.
        return ('[' + magnitude + '级]' + address + ' ' + eq_time
                + '(纬度:' + lttd + '经度:' + lgtd + ')发生地震'
                + ' 震源深度:' + eq_length + 'km')

    def eq_into_db(self, data):
        """Insert each record whose summary text is not stored yet."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        try:
            for eq_entity in data:
                result = db.select_zq_content(eq_entity['zq_content'])
                if result[0][0] == 0:
                    db.insert_eq(eq_entity, zq_time)
            db.commit()
        finally:
            # Release the connection even when a lookup or insert raises.
            db.close_db()

    def is_week_ago(self, date):
        """Return True when *date* falls within the last seven days.

        Args:
            date: Local timestamp formatted as ``%Y-%m-%d %H:%M:%S``.

        Raises:
            ValueError: If *date* does not match that format.
        """
        moment = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
        # Compare at whole-second resolution, as the input has no fraction.
        threshold = (datetime.datetime.now() - datetime.timedelta(days=7)).replace(microsecond=0)
        return moment > threshold
#5.气象预警类
class Warning:
    """Scrape the weather-alert list page and store new alerts in MySQL."""

    # Region suffixes, tried in this order when splitting the issuer name.
    _REGION_SUFFIXES = ('省', '自治区', '自治州', '市')

    def warn_info(self, url):
        """Load the alert page in the shared browser and parse every alert.

        Relies on the module-level ``chrome`` WebDriver created by the script.

        Args:
            url: Address of the alert list page.

        Returns:
            A list of alert dicts (see ``_parse_warning`` for the keys).
        """
        chrome.get(url)
        e = etree.HTML(chrome.page_source)
        warn_infos = e.xpath('//div[@class="dDisasterAlarm"]/ul[@class="dDUl"]/li/a/text()')
        warn_times = e.xpath('//div[@class="dDisasterAlarm"]/ul[@class="dDUl"]/li/span[@class="dTime"]/text()')
        return [self._parse_warning(info, moment)
                for info, moment in zip(warn_infos, warn_times)]

    @staticmethod
    def _parse_warning(warn_info, warn_time):
        """Split one alert headline into issuer region, alert type and level.

        The text before "发布" is treated as the issuer and the text after it
        as the alert; the last four characters of the alert are the level and
        the rest is the type.
        """
        issuer, _, alert = str(warn_info).partition("发布")
        # Fall back to the whole issuer with an empty city when no suffix
        # matches, rather than failing or reusing values from another alert.
        address, city = issuer, ''
        for suffix in Warning._REGION_SUFFIXES:
            head, found, tail = issuer.partition(suffix)
            if found:
                address, city = head + suffix, tail
                break
        return {"zq_content": warn_info, "warning_level": alert[-4:],
                "warning_type": alert[:-4], "city": city, "address": address,
                "zq_type": 5, "eq_time": warn_time}

    def warn_info_insert(self, data):
        """Insert each alert whose headline is not stored yet."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        try:
            for warn_entity in data:
                result = db.select_zq_content(warn_entity['zq_content'])
                if result[0][0] == 0:
                    db.insert_warn(warn_entity, zq_time)
            db.commit()
        finally:
            # Release the connection even when a lookup or insert raises.
            db.close_db()
#6小时降雨量排名前30
class RainOrder:
    """Scrape the precipitation ranking page and store it in MySQL."""

    def rain_info(self, url):
        """Render the ranking page in a throw-away headless Chrome and parse it.

        Args:
            url: Address of the ranking page.

        Returns:
            A list of dicts with keys ``county``, ``rain_total``,
            ``province`` and ``time``.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        browser = webdriver.Chrome(options=options)
        try:
            browser.get(url)
            html = browser.page_source
        finally:
            # Every call starts its own browser, so it must be shut down here
            # or each scheduled run leaves a Chrome process behind.
            browser.quit()
        e = etree.HTML(html)
        # Rainfall amounts
        rain_totals = e.xpath('//div[@id="phList"]//div[@class="col-xs-4 text-right"]/text()')
        # Counties
        counties = e.xpath('//div[@id="phList"]//div[@class="col-xs-8"]/span[1]/text()')
        # Provinces
        provinces = e.xpath('//div[@id="phList"]//div[@class="col-xs-8"]/span[2]/text()')
        # Observation time
        times = e.xpath('//div[@id="time_0"]/div/text()')
        return self._build_entities(rain_totals, counties, provinces, times)

    @staticmethod
    def _build_entities(rain_totals, counties, provinces, times):
        """Combine the scraped text columns into one dict per ranking row."""
        # The query yields a list of text nodes; store a single scalar (or
        # None when the node is missing) so the row can be bound as one SQL
        # parameter.
        observed_at = times[0] if times else None
        data = []
        for rain_total, county, province in zip(rain_totals, counties, provinces):
            # The county text is taken from the second space-separated token;
            # fall back to the whole text when there is no space.
            parts = county.split(" ")
            data.append({"county": parts[1] if len(parts) > 1 else parts[0],
                         "rain_total": rain_total.split('mm')[0],
                         "province": province,
                         "time": observed_at})
        return data

    def rain_info_insert(self, rain_datas):
        """Insert every ranking row and commit once at the end."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        try:
            for rain_data in rain_datas:
                db.insert_rainOrder(rain_data['county'], rain_data['rain_total'],
                                    rain_data['province'], rain_data['time'], zq_time)
            db.commit()
        finally:
            # Release the connection even when a row is malformed.
            db.close_db()
# 4.热点新闻
class HotNews:
    """Scrape search-result headlines and store those matching given keywords."""

    def ht_info(self, url):
        """Load the search page in the shared browser and collect headlines.

        Relies on the module-level ``chrome`` WebDriver created by the script.

        Args:
            url: Address of the search-result page.

        Returns:
            A list of headline strings with ``<em>`` markers removed.
        """
        chrome.get(url)
        time.sleep(2)
        # Scroll down and wait again before reading the page source.
        chrome.execute_script('document.documentElement.scrollTop=10000')
        time.sleep(2)
        e = etree.HTML(chrome.page_source)
        titles = e.xpath('//ul[@class="news-stream-basic-news-list"]/li/a/@title')
        return [self._clean_title(title) for title in titles]

    @staticmethod
    def _clean_title(title):
        """Remove the ``<em>``/``</em>`` markers from a headline."""
        return title.replace('<em>', '').replace('</em>', '')

    def hn_insert(self, data, hn_keywords):
        """Insert each headline that contains a keyword and is not stored yet.

        Args:
            data: Iterable of headline strings.
            hn_keywords: Iterable of substrings to look for.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        zq_type = 4
        try:
            for zq_content in data:
                # One match is enough; checking once per headline avoids
                # repeating the duplicate lookup for every matching keyword.
                if not any(keyword in zq_content for keyword in hn_keywords):
                    continue
                result = db.select_zq_content(zq_content)
                if result[0][0] == 0:
                    db.insert_hownews(zq_content, zq_time, zq_type)
            db.commit()
        finally:
            # Release the connection even when a lookup or insert raises.
            db.close_db()
# 数据库链接及操作
class DataBaseLink:
    """Wrapper around one PyMySQL connection with the queries this script uses.

    None of the insert/delete methods commit; callers must call ``commit()``
    before ``close_db()`` for their changes to be persisted.
    """

    def __init__(self, host, port, user, password, db, charset='utf8'):
        """Open the connection.

        Args:
            host: MySQL host name or address.
            port: MySQL port.
            user: Login user name.
            password: Login password.
            db: Name of the database to use.
            charset: Connection character set, ``utf8`` by default.

        Raises:
            pymysql.MySQLError: If the connection cannot be established.
        """
        self.__host = host
        self.__port = port
        self.__username = user
        self.__password = password
        self.__db = db
        self.__charset = charset
        try:
            self.__connect_database = pymysql.connect(
                host=self.__host, port=self.__port, user=self.__username,
                password=self.__password, db=self.__db, charset=self.__charset)
        except pymysql.MySQLError:
            print('连接失败')
            # Re-raise: without a connection every later call on this object
            # would only fail with a less helpful AttributeError.
            raise

    def _execute(self, sql, params):
        """Run one statement on a cursor that is closed afterwards."""
        with self.__connect_database.cursor() as cur:
            cur.execute(sql, params)

    def _write_failed(self, sql, label, error):
        """Report a failed write, roll back and build the failure result."""
        print(label, error)
        self.__connect_database.rollback()
        return sql + label

    def _count(self, sql, param):
        """Run a COUNT query; return its rows, or a failure string on error."""
        try:
            with self.__connect_database.cursor() as cur:
                cur.execute(sql, param)
                return cur.fetchall()
        except Exception as e:
            print('查询失败', e)
            # Callers test ``result[0][0] == 0``; with this string that test
            # is false, so a failed lookup is treated like "already stored".
            return sql + '查询失败'

    def select_zq_content(self, zq_content):
        """Count ``comprehensive`` rows whose ``zq_content`` equals the text.

        Returns:
            The fetched rows (the count is at ``[0][0]``), or a failure string.
        """
        sql = 'SELECT COUNT(1) as count FROM comprehensive WHERE zq_content=%s'
        return self._count(sql, zq_content)

    def select_weather(self, province_alias):
        """Count ``weather`` rows created in the current hour for an alias.

        Returns:
            The fetched rows (the count is at ``[0][0]``), or a failure string.
        """
        sql = "SELECT COUNT(1) FROM weather WHERE DATE_FORMAT(create_time,'%%Y-%%m-%%d %%H')=DATE_FORMAT(Now(),'%%Y-%%m-%%d %%H') and province_alias=%s"
        return self._count(sql, province_alias)

    def insert_eq(self, eq, zq_time):
        """Insert one earthquake dict; return True or a failure string."""
        sql = 'insert into comprehensive (zq_content,zq_type,eq_time,zq_tm,address,magnitude,lgtd,lttd,eq_length) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        try:
            self._execute(sql, [eq['zq_content'], eq['zq_type'], eq['eq_time'], zq_time,
                                eq['address'], eq['magnitude'], eq['lgtd'], eq['lttd'],
                                eq['eq_length']])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def insert_warn(self, warn, zq_time):
        """Insert one weather-alert dict; return True or a failure string."""
        sql = 'insert into comprehensive (zq_content,zq_tm,warning_level,warning_type,city,address,zq_type,eq_time) values (%s,%s,%s,%s,%s,%s,%s,%s)'
        try:
            self._execute(sql, [warn['zq_content'], zq_time, warn['warning_level'],
                                warn['warning_type'], warn['city'], warn['address'],
                                warn['zq_type'], warn['eq_time']])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def insert_weather(self, province, pname, type, state1, state2, stateDetailed, windState):
        """Insert one weather row; return True or a failure string."""
        sql = "insert into weather (province,province_alias,`type`,state1,state2,state_detailed,wind_state) values (%s,%s,%s,%s,%s,%s,%s)"
        try:
            self._execute(sql, [province, pname, type, state1, state2, stateDetailed, windState])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def insert_rainOrder(self, county, rain_total, province, rain_time, zq_time):
        """Insert one rainfall-ranking row; return True or a failure string."""
        sql = "insert into rain (pname,`name`,`time`,create_time,rain_num) values (%s,%s,%s,%s,%s)"
        try:
            self._execute(sql, [province, county, rain_time, zq_time, rain_total])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def insert_wr(self, zq_time, zq_type, zq_title, zq_content, eq_time):
        """Insert one weather-news row; return True or a failure string."""
        sql = "insert into comprehensive (zq_title,zq_type,eq_time,zq_tm,zq_content) values (%s,%s,%s,%s,%s)"
        try:
            self._execute(sql, [zq_title, zq_type, eq_time, zq_time, zq_content])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def insert_hownews(self, zq_content, zq_time, zq_type):
        """Insert one hot-news row; ``zq_time`` fills both time columns."""
        sql = "insert into comprehensive (zq_content,zq_type,eq_time,zq_tm) values (%s,%s,%s,%s)"
        try:
            self._execute(sql, [zq_content, zq_type, zq_time, zq_time])
        except Exception as e:
            return self._write_failed(sql, '插入失败', e)
        return True

    def deleteInfo(self, sevenday, threeday):
        """Delete rows older than the given cut-offs from all three tables.

        Args:
            sevenday: Cut-off compared against ``comprehensive.zq_tm``.
            threeday: Cut-off compared against ``create_time`` of ``weather``
                and ``rain``.

        Returns:
            True on success, otherwise a failure string (after a rollback).
            The deletes are not committed here.
        """
        sql = "delete from comprehensive where zq_tm < %s"
        sql1 = "delete from weather where create_time < %s"
        sql2 = "delete from rain where create_time < %s"
        try:
            with self.__connect_database.cursor() as cur:
                cur.execute(sql, [sevenday])
                cur.execute(sql1, [threeday])
                cur.execute(sql2, [threeday])
        except Exception as e:
            return self._write_failed(sql, '删除失败', e)
        return True

    def commit(self):
        """Commit the current transaction."""
        self.__connect_database.commit()

    def close_db(self):
        """Close the database connection."""
        self.__connect_database.close()
class Task:
    """Scheduled jobs; each one drives a scraper object created by the script.

    The methods read module-level names (``weather``, ``wr``, ``eq``,
    ``hotnews``, ``warn``, ``rain`` and the ``*_url`` values) that are defined
    in the ``__main__`` section.
    """

    def weather_task(self):
        """1. Crawl every configured weather feed and store the rows."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取天气信息...')
        for weather_url in weather_urls:
            weather_data = weather.weather_info(weather_url)
            if weather_data is not None:
                weather.weather_insert_db(weather_data)

    def wr_task(self):
        """2. Crawl weather news and store items matching the keywords."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取天气新闻...')
        wr_keywords = '台风|热带风暴|强热带风暴|暴雨|暴雪|雷暴|冰雹|大风|沙尘|龙卷风|洪涝|高温|山洪|地质灾害|干旱|大雪'.split('|')
        wr_entitys = wr.weather_report_info(weather_report_url)
        wr.wr_insert(wr_entitys, wr_keywords)

    def earthquake_task(self):
        """3. Crawl earthquake records and store the new ones."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取地震信息...')
        eq_data = eq.eq_info(earthquake_url)
        eq.eq_into_db(eq_data)

    def hotnews_task(self):
        """4. Crawl hot-news headlines and store those matching the keywords."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取热点新闻...')
        hn_keywords = '联通网络|基站|联通5G|通信|运营商|中国联通|大雪|暴雪'.split('|')
        hn_data = hotnews.ht_info(hotnews_url)
        hotnews.hn_insert(hn_data, hn_keywords)

    def warn_task(self):
        """5. Crawl weather alerts and store the new ones."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取气象预警...')
        warn_data = warn.warn_info(warnning_url)
        warn.warn_info_insert(warn_data)

    def rain_task(self):
        """6. Crawl the rainfall Top-30 ranking and store it."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取降雨量Top30...')
        rain_datas = rain.rain_info(rain_url)
        rain.rain_info_insert(rain_datas)

    def clean_data(self):
        """Delete rows older than seven days (news) or three days (weather, rain)."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在清除过期数据...')
            sevenDaysAgo = (datetime.datetime.now() - datetime.timedelta(days=7))
            threeDaysAgo = (datetime.datetime.now() - datetime.timedelta(days=3))
            db.deleteInfo(sevenDaysAgo, threeDaysAgo)
            # deleteInfo does not commit; without this the deletes are lost
            # when the connection is closed.
            db.commit()
        finally:
            db.close_db()

    def my_listener(self, event):
        """Print whether the finished job raised, based on ``event.exception``."""
        if event.exception:
            print('任务出错了!!!!!!')
        else:
            print('任务照常运行...')
if __name__ == '__main__':
    # Source pages. These stay module-level names because the Task methods
    # and some scraper classes read them as globals.
    weather_report_url = 'http://www.weather.com.cn/pubm/news2019_more_list10.htm'
    rain_url = "http://www.nmc.cn/publish/observations/6hour-precipitation.html"
    warnning_url = 'http://www.weather.com.cn/alarm/warninglist1.shtml'
    earthquake_url = 'https://www.cea.gov.cn/eportal/ui?struts.portlet.mode=view&struts.portlet.action=/portlet/expressEarthquake!queryExpressEarthquakeList.action&pageId=363409&moduleId=a852ba487b534470a84a30f00e7d6670'
    # Hot news: the URL is the result page of a keyword search.
    hotnews_url = 'http://so.ifeng.com/?q=%E4%B8%AD%E5%9B%BD%E8%81%94%E9%80%9A&c=1c'
    # Shared headless Chrome used by the Warning and HotNews scrapers.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    chrome = webdriver.Chrome(options=options)
    try:
        # Scraper objects
        eq = Earthquake()
        wr = WeatherReport()
        rain = RainOrder()
        hotnews = HotNews()
        warn = Warning()
        weather = Weather()
        weather_urls = weather.get_urls()
        # Log to a file in append mode
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename='log1.txt',
                            filemode='a')
        task = Task()
        # Run every job once right away, then hand over to the scheduler.
        task.wr_task()
        task.weather_task()
        task.rain_task()
        task.hotnews_task()
        task.warn_task()
        task.earthquake_task()
        task.clean_data()
        scheduler = BlockingScheduler()
        scheduler.add_job(func=task.weather_task, trigger='interval', minutes=45)
        scheduler.add_job(func=task.wr_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.rain_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.warn_task, trigger='interval', hours=3)
        scheduler.add_job(func=task.hotnews_task, trigger='interval', hours=12)
        scheduler.add_job(func=task.earthquake_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.clean_data, trigger='interval', hours=24)
        # Point the scheduler's logger at the logging module itself.
        scheduler._logger = logging
        scheduler.add_listener(task.my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
        scheduler.start()
    finally:
        # Shut the browser down when the script stops or fails, so no
        # headless Chrome process is left running.
        chrome.quit()
数据库表和天气配置表可以看https://blog.csdn.net/qq_48663998/article/details/118047946,谢谢大家,点个赞吧。