Python爬取综合信息(地震信息,天气,气象新闻,热点新闻,小时降雨量前30,气象预警)

Python爬取综合信息(地震信息,天气,气象新闻,热点新闻,小时降雨量前30,气象预警)

前置条件

1.使用python3(python3.6及以上)
2.需要安装requests库  pip install requests
3.需要安装fake_useragent pip install fake_useragent
4.需要安装谷歌浏览器以及对应版本的驱动
	https://blog.csdn.net/ifubing/article/details/104462714
5.安装selenium库 pip install selenium
6.使用的是mysql数据库 安装数据库连接用的pymysql库 pip install pymysql
7.安装分析网页用的lxml库 pip install lxml
8.安装定时任务库 pip install apscheduler

实现代码

代码比较简单 需要修改的地方有:
1.数据库连接

db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')

2.热点新闻的对象,在凤凰网搜索后将链接放到代码里,并修改关键词
在这里插入图片描述
3.需要爬取的天气,在天气配置文件中修改

# coding=UTF-8
# author : Nanfu
# date   : 2021/6/11 13:54
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import json
import time
import datetime
import pymysql
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
import logging



#1.天气类
class Weather:
    """Collect regional weather from XML feeds and store it in MySQL.

    Feed definitions come from a JSON configuration file; ``weather_info``
    reads the ``url``, ``type`` and ``pname`` keys of each entry.
    """

    def get_urls(self, filename='F://pythonProject//demon1//day06//province.json'):
        """Load the feed configuration from a JSON file.

        Args:
            filename: Path of the JSON configuration file. The default is
                the location the script originally hard-coded.

        Returns:
            The decoded JSON content.
        """
        # The context manager closes the handle even when decoding fails.
        with open(filename, 'rb') as f_obj:
            return json.load(f_obj)

    def weather_info(self, url_obj):
        """Download one feed and parse it unless it was already stored this hour.

        Args:
            url_obj: Configuration entry providing ``url``, ``type`` and
                ``pname``.

        Returns:
            A list of dicts with the keys ``province``, ``pname``, ``type``,
            ``state1``, ``state2``, ``stateDetailed`` and ``windState``.
            The list is empty when ``select_weather`` does not report a
            count of zero for ``pname``.
        """
        url = url_obj['url']
        kind = url_obj['type']
        pname = url_obj['pname']
        response = requests.get(url=url, headers={'User-Agent': UserAgent().random})
        response.encoding = 'utf-8'
        html = response.text

        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            db_result = db.select_weather(pname)
        finally:
            # Release the connection even if the lookup raises.
            db.close_db()
        if db_result[0][0] != 0:
            return []
        return self._parse_weather(html, pname, kind)

    def _parse_weather(self, html, pname, kind):
        """Turn the raw feed text into a list of de-duplicated weather dicts."""
        root_open = "<" + pname + " dn=\"day\">"
        if html.find(root_open) == -1:
            root_open = "<" + pname + " dn=\"nay\">"
        # Drop the root element and the "<city " prefix so that splitting on
        # "/>" leaves one attribute string per record.
        # NOTE(review): the reason for also removing " an" is not visible in
        # this file; it is kept exactly as the original had it.
        result = (html.replace(root_open, "")
                  .replace("</" + pname + ">", "")
                  .replace("<city ", "")
                  .replace(" an", ""))
        data = []
        for weather_data in result.split("/>"):
            # Fragments without a stateDetailed attribute (for example the
            # text after the last record) are not records. Previously they
            # fell through and reused the preceding record's fields, or
            # raised UnboundLocalError when the first fragment was one.
            if weather_data.find("stateDetailed") == -1:
                continue
            cityData = weather_data.split(" ")
            if kind == "0":
                province = cityData[0].replace("quName=", "").replace("\"", "").replace("\n", "").replace("\r", "")
                state1 = cityData[3].replace("state1=", "").replace("\"", "")
                state2 = cityData[4].replace("state2=", "").replace("\"", "")
                stateDetailed = cityData[5].replace("stateDetailed=", "").replace("\"", "")
                windState = cityData[8].replace("windState=", "").replace("\"", "")
            else:
                province = cityData[2].replace("cityname=", "").replace("\"", "")
                state1 = cityData[6].replace("state1=", "").replace("\"", "")
                state2 = cityData[7].replace("state2=", "").replace("\"", "")
                stateDetailed = cityData[8].replace("stateDetailed=", "").replace("\"", "")
                windState = cityData[12].replace("windState=", "").replace("\"", "")

            weather_info = {"province": province, "pname": pname, "type": kind,
                            "state1": state1, "state2": state2,
                            "stateDetailed": stateDetailed, "windState": windState}
            if weather_info not in data:
                data.append(weather_info)
        return data

    def weather_insert_db(self, weather_datas):
        """Insert every parsed weather row, committing after each one.

        Args:
            weather_datas: Iterable of dicts as produced by ``weather_info``.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            for weather_data in weather_datas:
                db.insert_weather(
                    weather_data['province'],
                    weather_data['pname'],
                    weather_data['type'],
                    weather_data['state1'],
                    weather_data['state2'],
                    weather_data['stateDetailed'],
                    weather_data['windState'],
                )
                db.commit()
        finally:
            db.close_db()
#2.气象新闻类
class WeatherReport:
    """Download the weather-news feed and store the entries matching keywords."""

    def weather_report_info(self, url):
        """Fetch the feed at ``url`` and decode its JSON payload.

        The response text is cut from just after its first ``:`` up to its
        last ``}`` before being handed to ``json.loads``.

        Args:
            url: Address of the news feed.

        Returns:
            The decoded JSON value.
        """
        response = requests.get(url)
        response.encoding = 'utf-8'
        html = response.text
        payload = html[html.find(":") + 1:html.rfind("}")]
        return json.loads(payload)

    def wr_insert(self, wr_entitys, keywords):
        """Insert every entry whose title (field ``c1``) contains a keyword.

        Args:
            wr_entitys: Iterable of feed entries; the fields ``c1``, ``c5``,
                ``c6`` and ``c16`` are read.
            keywords: Strings to look for in the title.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            zq_type = 2
            for wr_entity in wr_entitys:
                zq_title = str(wr_entity['c1'])
                if not any(zq_title.find(keyword) > -1 for keyword in keywords):
                    continue
                zq_content = str(wr_entity['c16'])
                eq_time = str(wr_entity['c5']) + " " + str(wr_entity['c6'])
                db.insert_wr(zq_time, zq_type, zq_title, zq_content, eq_time)
                db.commit()
        finally:
            # Close the connection even when an entry is malformed.
            db.close_db()
#3.地震类
class Earthquake:
    """Fetch the earthquake bulletin and store events from the last seven days."""

    def eq_info(self, url):
        """Download the bulletin and return the leading run of recent events.

        Iteration stops at the first event that is not within the last seven
        days (presumably the feed is ordered newest first; verify against
        the service).

        Args:
            url: Address of the bulletin endpoint.

        Returns:
            A list of event dicts as built by ``_build_entity``.
        """
        response = requests.get(
            url=url,
            headers={'User-Agent': UserAgent().random},
            params={"region": "1", "areaname": "", "dateType": "2", "magnitude": "0"})
        response.encoding = 'utf-8'
        eq_obj = json.loads(response.text)
        data = []
        for eq in eq_obj:
            eq_entity = self._build_entity(eq)
            if not self.is_week_ago(eq_entity['eq_time']):
                break
            data.append(eq_entity)
        return data

    def _build_entity(self, eq):
        """Convert one raw bulletin record into the dict stored in the database."""
        address = str(eq['epicenter'])
        eq_time = str(eq['orig_time'])
        lttd = str(eq['latitudes'])    # latitude
        lgtd = str(eq['longitudes'])   # longitude
        magnitude = str(eq['num_mag'])
        eq_length = str(eq['depth'])   # focal depth
        # The depth clause previously repeated the magnitude; it now reports
        # the depth. NOTE(review): eq_into_db de-duplicates on this text, so
        # events stored with the old wording will be inserted once more.
        zq_content = ('[' + magnitude + '级]' + address + ' ' + eq_time
                      + '(纬度:' + lttd + '经度:' + lgtd + ')发生地震'
                      + ' 震源深度:' + eq_length + 'km')
        return {"address": address, "eq_time": eq_time, "lttd": lttd, "lgtd": lgtd,
                "magnitude": magnitude, "eq_length": eq_length,
                "zq_content": zq_content, 'zq_type': '3'}

    def eq_into_db(self, data):
        """Insert the events whose summary text is not stored yet.

        Args:
            data: Iterable of event dicts as returned by ``eq_info``.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            for eq_entity in data:
                result = db.select_zq_content(eq_entity['zq_content'])
                if result[0][0] == 0:
                    db.insert_eq(eq_entity, zq_time)
                    db.commit()
        finally:
            db.close_db()

    def is_week_ago(self, date):
        """Return True when ``date`` is later than seven days before now.

        Args:
            date: Timestamp formatted as ``%Y-%m-%d %H:%M:%S``.

        Raises:
            ValueError: If ``date`` does not match that format.
        """
        moment = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
        return moment > datetime.datetime.now() - datetime.timedelta(days=7)
#5.气象预警类
class Warning:
    """Scrape the weather-alert list and store alerts that are not stored yet.

    ``warn_info`` drives the module-level ``chrome`` WebDriver that the
    ``__main__`` section of this script creates.
    """

    def warn_info(self, url):
        """Load the alert page and parse every listed alert.

        Args:
            url: Address of the alert list page.

        Returns:
            A list of alert dicts (see ``_parse_warning``). Titles that do
            not contain "发布" are skipped.
        """
        data = []
        chrome.get(url)
        e = etree.HTML(chrome.page_source)
        warn_infos = e.xpath('//div[@class="dDisasterAlarm"]/ul[@class="dDUl"]/li/a/text()')
        warn_times = e.xpath('//div[@class="dDisasterAlarm"]/ul[@class="dDUl"]/li/span[@class="dTime"]/text()')
        for warn_info, warn_time in zip(warn_infos, warn_times):
            warn_entity = self._parse_warning(str(warn_info), str(warn_time))
            if warn_entity is not None:
                data.append(warn_entity)
        return data

    def _parse_warning(self, title, warn_time):
        """Split one alert title into issuer region, alert type and level.

        Returns None when the title has no "发布" marker; such a title used
        to raise IndexError.
        """
        issuer, marker, detail = title.partition("发布")
        if not marker:
            return None
        # The last four characters are stored as the level, the rest as the type.
        warning_level = detail[-4:]
        warning_type = detail[:-4]
        # Fallback when no region suffix is present. The original left both
        # variables unset here, which raised UnboundLocalError or silently
        # reused the previous alert's region.
        address, city = issuer, ''
        for suffix in ('省', '自治区', '自治州', '市'):
            head, found, tail = issuer.partition(suffix)
            if found:
                address, city = head + suffix, tail
                break
        return {"zq_content": title, "warning_level": warning_level,
                "warning_type": warning_type, "city": city, "address": address,
                "zq_type": 5, "eq_time": warn_time}

    def warn_info_insert(self, data):
        """Insert the alerts whose title text is not stored yet.

        Args:
            data: Iterable of alert dicts as returned by ``warn_info``.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            for warn_entity in data:
                result = db.select_zq_content(warn_entity['zq_content'])
                if result[0][0] == 0:
                    db.insert_warn(warn_entity, zq_time)
                    db.commit()
        finally:
            db.close_db()
#6小时降雨量排名前30
class RainOrder:
    """Scrape the precipitation ranking page and store its rows."""

    def rain_info(self, url):
        """Render the ranking page in headless Chrome and parse its rows.

        Args:
            url: Address of the ranking page.

        Returns:
            A list of dicts with the keys ``county``, ``rain_total``,
            ``province`` and ``time``. ``time`` is the first non-blank text
            found under the ``time_0`` element, or ``''`` when there is none.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
        chrome = webdriver.Chrome(options=options)
        try:
            chrome.get(url)
            html = chrome.page_source
        finally:
            # This method owns the browser it started, so always shut it down;
            # otherwise every call leaves a Chrome process behind.
            chrome.quit()

        e = etree.HTML(html)
        # Rainfall amounts
        rain_totals = e.xpath('//div[@id="phList"]//div[@class="col-xs-4 text-right"]/text()')
        # Counties
        counties = e.xpath('//div[@id="phList"]//div[@class="col-xs-8"]/span[1]/text()')
        # Provinces
        provinces = e.xpath('//div[@id="phList"]//div[@class="col-xs-8"]/span[2]/text()')
        # Observation time: xpath returns a list, but a single value is stored.
        time_nodes = e.xpath('//div[@id="time_0"]/div/text()')
        rain_time = next((str(node).strip() for node in time_nodes if str(node).strip()), '')

        data = []
        for rain_total, county, province in zip(rain_totals, counties, provinces):
            # The county text is split on spaces and the second piece is kept;
            # fall back to the whole text when there is no space.
            pieces = str(county).split(" ")
            county_name = pieces[1] if len(pieces) > 1 else str(county)
            data.append({"county": county_name,
                         "rain_total": str(rain_total).split('mm')[0],
                         "province": str(province),
                         "time": rain_time})
        return data

    def rain_info_insert(self, rain_datas):
        """Insert every ranking row, committing after each one.

        Args:
            rain_datas: Iterable of dicts as returned by ``rain_info``.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            for rain_data in rain_datas:
                db.insert_rainOrder(rain_data['county'], rain_data['rain_total'],
                                    rain_data['province'], rain_data['time'], zq_time)
                db.commit()
        finally:
            db.close_db()
# 4.热点新闻
class HotNews:
    """Scrape news search results and store the headlines matching keywords.

    ``ht_info`` drives the module-level ``chrome`` WebDriver that the
    ``__main__`` section of this script creates.
    """

    def ht_info(self, url):
        """Load the search page, scroll it once and collect the headline titles.

        Args:
            url: Address of the search result page.

        Returns:
            A list of title strings with ``<em>`` / ``</em>`` removed.
        """
        chrome.get(url)
        time.sleep(2)
        # Scroll down and wait again before reading the page
        # (presumably so that more results are loaded; TODO confirm).
        js = 'document.documentElement.scrollTop=10000'
        chrome.execute_script(js)
        time.sleep(2)
        e = etree.HTML(chrome.page_source)
        zq_contents = e.xpath('//ul[@class="news-stream-basic-news-list"]/li/a/@title')
        return [title.replace('<em>', '').replace('</em>', '') for title in zq_contents]

    def hn_insert(self, data, hn_keywords):
        """Insert the headlines that contain a keyword and are not stored yet.

        Args:
            data: Iterable of headline strings.
            hn_keywords: Strings to look for in each headline.
        """
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        try:
            zq_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            zq_type = 4
            for zq_content in data:
                # One duplicate check per headline is enough, however many
                # keywords it matches.
                if not any(zq_content.find(hn_keyword) > -1 for hn_keyword in hn_keywords):
                    continue
                result = db.select_zq_content(zq_content)
                if result[0][0] == 0:
                    db.insert_hownews(zq_content, zq_time, zq_type)
                    db.commit()
        finally:
            db.close_db()
# 数据库链接及操作
class DataBaseLink:
    """Thin wrapper around one PyMySQL connection and the script's SQL statements.

    Select methods return the fetched rows, or ``sql + '查询失败'`` when the
    query fails. Write methods return ``True``, or ``sql`` plus a failure
    suffix after rolling back. None of the write methods commits; callers
    call ``commit()`` themselves.
    """

    def __init__(self, host, port, user, password, db, charset='utf8'):
        """Open the connection.

        Args:
            host: MySQL host name.
            port: MySQL port.
            user: User name for the connection.
            password: Password for the connection.
            db: Database name.
            charset: Connection character set, ``utf8`` by default.

        Raises:
            pymysql.MySQLError: If the connection cannot be established.
                Previously the error was swallowed and the first later call
                failed with AttributeError instead.
        """
        self.__host = host
        self.__port = port
        self.__username = user
        self.__password = password
        self.__db = db
        self.__charset = charset
        try:
            self.__connect_database = pymysql.connect(host=self.__host, port=self.__port, user=self.__username,
                                                      password=self.__password, db=self.__db, charset=self.__charset)
        except pymysql.MySQLError as e:
            print('连接失败', e)
            raise

    def _select(self, sql, args):
        """Run a query and return all rows, or ``sql + '查询失败'`` on error."""
        cur = None
        try:
            cur = self.__connect_database.cursor()
            cur.execute(sql, args)
            return cur.fetchall()
        except Exception as e:
            print('查询失败', e)
            return sql + '查询失败'
        finally:
            if cur is not None:
                cur.close()

    def _write(self, sql, args, failure='插入失败'):
        """Run one write statement; roll back and return ``sql + failure`` on error.

        ``args`` may be a zero-argument callable. It is then evaluated inside
        the error handling, so a missing dictionary key is reported like any
        other failed write instead of raising KeyError.
        """
        cur = None
        try:
            cur = self.__connect_database.cursor()
            params = args() if callable(args) else args
            cur.execute(sql, params)
        except Exception as e:
            print(failure, e)
            self.__connect_database.rollback()
            return sql + failure
        finally:
            if cur is not None:
                cur.close()
        return True

    # Duplicate check for earthquake / alert / news text
    def select_zq_content(self, zq_content):
        """Count the ``comprehensive`` rows whose ``zq_content`` equals the argument."""
        sql = 'SELECT COUNT(1) as count FROM comprehensive WHERE zq_content=%s'
        return self._select(sql, (zq_content,))

    def select_weather(self, province_alias):
        """Count the ``weather`` rows created in the current hour for ``province_alias``."""
        sql = "SELECT COUNT(1) FROM weather WHERE DATE_FORMAT(create_time,'%%Y-%%m-%%d %%H')=DATE_FORMAT(Now(),'%%Y-%%m-%%d %%H') and province_alias=%s"
        return self._select(sql, (province_alias,))

    # Insert an earthquake event
    def insert_eq(self, eq, zq_time):
        """Insert one earthquake dict; ``zq_time`` is stored in ``zq_tm``."""
        sql = 'insert into comprehensive (zq_content,zq_type,eq_time,zq_tm,address,magnitude,lgtd,lttd,eq_length) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        return self._write(sql, lambda: [eq['zq_content'], eq['zq_type'], eq['eq_time'], zq_time,
                                         eq['address'], eq['magnitude'], eq['lgtd'], eq['lttd'],
                                         eq['eq_length']])

    # Insert a weather alert
    def insert_warn(self, warn, zq_time):
        """Insert one alert dict; ``zq_time`` is stored in ``zq_tm``."""
        sql = 'insert into comprehensive (zq_content,zq_tm,warning_level,warning_type,city,address,zq_type,eq_time) values (%s,%s,%s,%s,%s,%s,%s,%s)'
        return self._write(sql, lambda: [warn['zq_content'], zq_time, warn['warning_level'],
                                         warn['warning_type'], warn['city'], warn['address'],
                                         warn['zq_type'], warn['eq_time']])

    # Insert a weather row
    def insert_weather(self, province, pname, type, state1, state2, stateDetailed, windState):
        """Insert one row into ``weather``."""
        sql = "insert into weather (province,province_alias,`type`,state1,state2,state_detailed,wind_state) values (%s,%s,%s,%s,%s,%s,%s)"
        return self._write(sql, [province, pname, type, state1, state2, stateDetailed, windState])

    # Insert a rainfall ranking row
    def insert_rainOrder(self, county, rain_total, province, rain_time, zq_time):
        """Insert one row into ``rain``; ``zq_time`` is stored in ``create_time``."""
        sql = "insert into rain (pname,`name`,`time`,create_time,rain_num)  values (%s,%s,%s,%s,%s)"
        return self._write(sql, [province, county, rain_time, zq_time, rain_total])

    # Insert a weather news entry
    def insert_wr(self, zq_time, zq_type, zq_title, zq_content, eq_time):
        """Insert one weather news entry into ``comprehensive``."""
        sql = "insert into comprehensive (zq_title,zq_type,eq_time,zq_tm,zq_content)  values (%s,%s,%s,%s,%s)"
        return self._write(sql, [zq_title, zq_type, eq_time, zq_time, zq_content])

    # Insert a hot-news headline
    def insert_hownews(self, zq_content, zq_time, zq_type):
        """Insert one headline; ``zq_time`` fills both ``eq_time`` and ``zq_tm``."""
        sql = "insert into comprehensive (zq_content,zq_type,eq_time,zq_tm) values (%s,%s,%s,%s)"
        return self._write(sql, [zq_content, zq_type, zq_time, zq_time])

    # Delete expired rows
    def deleteInfo(self, sevenday, threeday):
        """Delete old rows from ``comprehensive``, ``weather`` and ``rain``.

        Args:
            sevenday: Cut-off compared with ``comprehensive.zq_tm``.
            threeday: Cut-off compared with ``create_time`` of ``weather``
                and ``rain``.

        Returns:
            ``True``, or the first statement plus ``'删除失败'`` after a
            rollback. The deletes are not committed here.
        """
        sql = "delete  from comprehensive where zq_tm < %s"
        sql1 = "delete  from weather where create_time < %s"
        sql2 = "delete  from rain where create_time < %s"
        cur = None
        try:
            cur = self.__connect_database.cursor()
            cur.execute(sql, [sevenday])
            cur.execute(sql1, [threeday])
            cur.execute(sql2, [threeday])
        except Exception as e:
            print('删除失败', e)
            self.__connect_database.rollback()
            return sql + '删除失败'
        finally:
            if cur is not None:
                cur.close()
        return True

    def commit(self):
        """Commit the current transaction."""
        self.__connect_database.commit()

    # Close the database connection
    def close_db(self):
        """Close the underlying connection."""
        self.__connect_database.close()

class Task:
    """Scheduler jobs.

    Each job uses scraper objects and URLs that the ``__main__`` section of
    this script defines at module level (``weather``, ``wr``, ``eq``,
    ``hotnews``, ``warn``, ``rain`` and the ``*_url`` variables).
    """

    def weather_task(self):
        """1. Fetch and store the weather for every configured feed."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'正在爬取天气信息...')
        for weather_url in weather_urls:
            weather_data = weather.weather_info(weather_url)
            if weather_data is not None:
                weather.weather_insert_db(weather_data)

    def wr_task(self):
        """2. Fetch weather news and store the entries matching the keywords."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取天气新闻...')
        wr_keywords = '台风|热带风暴|强热带风暴|暴雨|暴雪|雷暴|冰雹|大风|沙尘|龙卷风|洪涝|高温|山洪|地质灾害|干旱|大雪'.split('|')
        wr_entitys = wr.weather_report_info(weather_report_url)
        wr.wr_insert(wr_entitys, wr_keywords)

    def earthquake_task(self):
        """3. Fetch and store earthquake events."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取地震信息...')
        eq_data = eq.eq_info(earthquake_url)
        eq.eq_into_db(eq_data)

    def hotnews_task(self):
        """4. Fetch news headlines and store those matching the keywords."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取热点新闻...')
        hn_keywords = '联通网络|基站|联通5G|通信|运营商|中国联通|大雪|暴雪'.split('|')
        hn_data = hotnews.ht_info(hotnews_url)
        hotnews.hn_insert(hn_data, hn_keywords)

    def warn_task(self):
        """5. Fetch and store weather alerts."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取气象预警...')
        warn_data = warn.warn_info(warnning_url)
        warn.warn_info_insert(warn_data)

    def rain_task(self):
        """6. Fetch and store the rainfall ranking."""
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在爬取降雨量Top30...')
        rain_datas = rain.rain_info(rain_url)
        rain.rain_info_insert(rain_datas)

    def clean_data(self):
        """Delete rows older than seven days (``comprehensive``) or three days
        (``weather`` and ``rain``)."""
        db = DataBaseLink(host="127.0.0.1", port=3306, user='root', password='java', db='school')
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '正在清除过期数据...')
        try:
            sevenDaysAgo = (datetime.datetime.now() - datetime.timedelta(days=7))
            threeDaysAgo = (datetime.datetime.now() - datetime.timedelta(days=3))
            # deleteInfo does not commit. Without this commit the deletes are
            # discarded when the connection is closed.
            if db.deleteInfo(sevenDaysAgo, threeDaysAgo) is True:
                db.commit()
        finally:
            db.close_db()

    def my_listener(self, event):
        """Print whether the job behind a scheduler event failed or succeeded.

        Args:
            event: Scheduler event; only its ``exception`` attribute is read.
        """
        if event.exception:
            print('任务出错了!!!!!!')
        else:
            print('任务照常运行...')
if __name__ == '__main__':
    # Everything below stays at module level on purpose: the Task, Warning and
    # HotNews classes look these names up as globals.
    weather_report_url = 'http://www.weather.com.cn/pubm/news2019_more_list10.htm'
    rain_url="http://www.nmc.cn/publish/observations/6hour-precipitation.html"
    warnning_url='http://www.weather.com.cn/alarm/warninglist1.shtml'
    earthquake_url='https://www.cea.gov.cn/eportal/ui?struts.portlet.mode=view&struts.portlet.action=/portlet/expressEarthquake!queryExpressEarthquakeList.action&pageId=363409&moduleId=a852ba487b534470a84a30f00e7d6670'
    # Hot news: the URL of an ifeng.com search for the wanted keyword
    hotnews_url='http://so.ifeng.com/?q=%E4%B8%AD%E5%9B%BD%E8%81%94%E9%80%9A&c=1c'

    # Start the shared headless Chrome. ``options=`` replaces the deprecated
    # ``chrome_options=`` keyword.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    chrome = webdriver.Chrome(options=options)
    try:
        # Scraper objects
        eq = Earthquake()
        wr = WeatherReport()
        rain = RainOrder()
        hotnews = HotNews()
        warn = Warning()
        weather = Weather()
        weather_urls = weather.get_urls()
        # Log to a file
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename='log1.txt',
                            filemode='a')
        # Run every job once right away, then hand over to the scheduler.
        task = Task()
        task.wr_task()
        task.weather_task()
        task.rain_task()
        task.hotnews_task()
        task.warn_task()
        task.earthquake_task()
        task.clean_data()
        scheduler = BlockingScheduler()
        scheduler.add_job(func=task.weather_task, trigger='interval', minutes=45)
        scheduler.add_job(func=task.wr_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.rain_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.warn_task, trigger='interval', hours=3)
        scheduler.add_job(func=task.hotnews_task, trigger='interval', hours=12)
        scheduler.add_job(func=task.earthquake_task, trigger='interval', hours=24)
        scheduler.add_job(func=task.clean_data, trigger='interval', hours=24)
        scheduler._logger=logging
        scheduler.add_listener(task.my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
        scheduler.start()
    finally:
        # Shut the browser down when the script stops or a start-up job fails.
        chrome.quit()


数据库表和天气配置表可以看https://blog.csdn.net/qq_48663998/article/details/118047946,谢谢大家,点个赞吧。

  • 7
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值