爬虫笔记1--爬取墨迹天气

爬虫笔记1--爬取墨迹天气

 

最近由于需要写了一个简单的墨迹天气爬取脚本,主要功能为爬取墨迹天气,然后将其存到MySQL数据库中。

 

1、功能

本代码主要功能为:爬取墨迹天气数据,将数据保存到MySQL数据库。其中数据库的建立脚本和数据的插入脚本在第二部分代码块中,此处不赘述。此处简单说明一下如何使用 requests_html 库爬取数据。

requests_html是18年出的一个Python库,非常适用于新手爬取数据,以下以墨迹天气为例子加以说明。

此处爬取墨迹天气中 空气质量综合评价(良)和CO指数(5),其代码如下所示:

# -*- coding:utf-8 -*-
from requests_html import HTMLSession

def GetAqi(city):
    """Scrape the overall air-quality rating and the CO index for a city.

    Args:
        city: URL slug of a Guangdong city/district on tianqi.moji.com,
            e.g. 'pingshan-district'.

    Returns:
        dict: {"空气质量综合评价": <rating text>, "CO指数": <CO text>}, both
        values exactly as shown on the page.

    Raises:
        IndexError: if one of the CSS selectors matches nothing (page layout
            changed or the request did not return the expected page).
    """
    url = 'https://tianqi.moji.com/aqi/china/guangdong/' + city
    # CSS selectors obtained with Chrome DevTools "Copy selector".
    selectors = {
        "空气质量综合评价": '#aqi_desc',
        "CO指数": '#aqi_info > div.aqi_info_item > ul > li:nth-child(6) > span',
    }
    dictaqi = {}
    # The session owns a connection pool; the with-block releases it.
    with HTMLSession() as session:
        page = session.get(url).html
        for label, selector in selectors.items():
            element = page.find(selector, first=True)
            if element is None:
                # Same exception type as indexing an empty result list,
                # but with enough context to diagnose the miss.
                raise IndexError('selector %r matched nothing at %s' % (selector, url))
            dictaqi[label] = element.text
    return dictaqi

if __name__ == '__main__':
    # Demo run: fetch and print the air-quality summary for one district.
    city = 'pingshan-district'
    aqi_summary = GetAqi(city)
    print(aqi_summary)

爬取结果如下:

需要注意的是如何获取特定字段在html中的路径,此处直接在Chrome浏览器中通过如下方法获取:Inspect->Elements->选中“良”对应的最下层标签->Copy->Copy selector,将拷贝的内容粘贴到上述代码的seldesc中,然后同样将CO的路径粘贴到上述代码的selco中即可,具体操作如下图所示:

 

2、代码

MySQL数据库脚本:

-- Database holding the scraped weather tables.
-- The character set is declared explicitly because the tables store Chinese
-- text; a server whose default charset is latin1 would otherwise reject or
-- garble those values.
CREATE SCHEMA `weather` DEFAULT CHARACTER SET utf8mb4;

#weather (id,cid,Fevn,Ftemp,Fcond,Fhumi,Futime,Fwind,ts)
-- One row per scraped weather snapshot.
-- Columns, per the table COMMENT: row id, city id, overall environment
-- index, temperature, weather condition, humidity, time the source last
-- updated the reading, wind, and the time the row was written.
-- `id` has no AUTO_INCREMENT: the inserting script supplies it as max(id)+1.
CREATE TABLE `weather`.`weather` (
  `id` INT NOT NULL,
  `cid` INT NOT NULL,
  `Fevn` VARCHAR(20) NOT NULL,
  `Ftemp` TINYINT(1) NOT NULL,
  `Fcond` VARCHAR(20) NOT NULL,
  `Fhumi` TINYINT(1) NOT NULL,
  `Futime` DATETIME NULL,
  `Fwind` VARCHAR(20) NOT NULL,
  `ts` TIMESTAMP NULL,
  PRIMARY KEY (`id`))
COMMENT = '该表存放id,城市id,环境综合指数,温度,天气状况,湿度,天气更新时间,风速,写入数据时间戳';


#city (id,Sname,Lname)
-- Lookup table of crawled cities.
-- Per the table COMMENT: `id` is the city id, `Sname` the short name
-- (the scripts use the URL slug, e.g. 'pingshan-district') and `Lname`
-- the full address of the city.
CREATE TABLE `weather`.`city` (
  `id` INT NOT NULL,
  `Sname` VARCHAR(50) NOT NULL,
  `Lname` VARCHAR(200) NOT NULL,
  PRIMARY KEY (`id`))
 COMMENT = '城市id,城市名称缩写,城市名称绝对地址';

#'1', 'pingshan-district', '中国广东省深圳市坪山区'

#aqi (id,cid,val,desc,pm10,pm25,no2,so2,o3,co,ts)
-- One row per scraped air-quality snapshot.
-- Per the table COMMENT: row id, city id, index value, index description,
-- then the PM10, PM2.5, NO2, SO2, O3 and CO readings; `ts` is the write time.
-- The numeric readings use SMALLINT UNSIGNED (0..65535): a signed TINYINT
-- tops out at 127, and AQI / particulate readings regularly exceed that,
-- which makes the INSERT fail with an out-of-range error in strict mode.
-- `desc` is a reserved word and must stay backtick-quoted.
CREATE TABLE `weather`.`aqi` (
  `id` INT NOT NULL,
  `cid` INT NOT NULL,
  `val` SMALLINT UNSIGNED NOT NULL,
  `desc` VARCHAR(10) NOT NULL,
  `pm10` SMALLINT UNSIGNED NOT NULL,
  `pm25` SMALLINT UNSIGNED NOT NULL,
  `no2` SMALLINT UNSIGNED NOT NULL,
  `so2` SMALLINT UNSIGNED NOT NULL,
  `o3` SMALLINT UNSIGNED NOT NULL,
  `co` SMALLINT UNSIGNED NOT NULL,
  `ts` TIMESTAMP NOT NULL,
  PRIMARY KEY (`id`))
COMMENT = '该表存放天气综合指数,id,城市id,指数值,指数描述,pm10,pm2.5,NO2,SO2,O3,CO';

Python脚本:

# -*- coding:utf-8 -*-
import time,datetime
from requests_html import HTMLSession
import pymysql
import traceback

# Notice: requests_html requires Python 3.6 or newer
# https://blog.csdn.net/nkwshuyi/article/details/79435248
# https://github.com/kennethreitz/requests-html

class MysqlClass():
    """Small PyMySQL helper for the `weather` database.

    Connection settings are class attributes; override them on the class or
    on an instance before calling any method. Every public method except
    GetMaxId opens its own connection and closes it again before returning,
    even when the statement fails.

    All values are passed to the driver as bound parameters instead of being
    formatted into the SQL text, so quotes inside scraped strings or city
    names can neither break the statement nor inject SQL.
    """
    db = None  # connection in use by the current call, None otherwise
    host = 'localhost'
    usr = 'root'
    pwd = 'YourPwd'
    dbname = 'weather'
    port = 3306
    charset = 'utf8'

    def _connect(self):
        """Return a new connection built from the class-level settings."""
        # Keyword arguments only: recent PyMySQL releases reject positional
        # connection arguments.
        return pymysql.connect(host=self.host, user=self.usr,
                               password=self.pwd, database=self.dbname,
                               port=self.port, charset=self.charset)

    def _next_id(self, tableName):
        """Return the id for the next row of tableName (max(id)+1, or 1)."""
        num = self.GetMaxId(tableName)
        return 1 if num is None else num + 1

    def _insert_row(self, sql, params, data_dict):
        """Execute one INSERT on the open connection and commit it.

        On failure the transaction is rolled back and the offending data and
        traceback are printed; the error is not re-raised (best effort).
        """
        try:
            with self.db.cursor() as cursor:
                cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            print('error', data_dict)
            self.db.rollback()
            traceback.print_exc()

    def ShowVersion(self):
        """Print the MySQL server version using a throw-away connection."""
        db = self._connect()
        try:
            with db.cursor() as cursor:
                cursor.execute("SELECT VERSION()")
                # fetchone() returns a one-element tuple.
                data = cursor.fetchone()
            print("Database version : %s " % data)
        finally:
            db.close()

    def OpenDB(self):
        """Open a connection and store it in self.db."""
        self.db = self._connect()

    def CloseDB(self):
        """Close the connection held in self.db, if there is one."""
        if self.db is not None:
            self.db.close()
            self.db = None

    def ExcuteSQL(self, str_sql, args=None):
        """Execute one statement in its own connection and commit it.

        Args:
            str_sql: SQL text; may contain %s placeholders when args is given.
            args: optional sequence/dict of values bound by the driver.

        Errors are rolled back and printed, not raised.
        """
        self.OpenDB()
        try:
            with self.db.cursor() as cursor:
                cursor.execute(str_sql, args)
            self.db.commit()
        except Exception:
            self.db.rollback()
            traceback.print_exc()
        finally:
            self.CloseDB()

    def GetMaxId(self, tableName):
        """Return max(id) of tableName using the already-open connection.

        The caller must have called OpenDB first. Returns None for an empty
        table and 0 when the query fails (the traceback is printed).
        tableName is concatenated into the SQL because identifiers cannot be
        bound as parameters; pass trusted, literal table names only.
        """
        sql_1 = "select max(id) from " + tableName
        maxnum = 0
        try:
            with self.db.cursor() as cursor:
                cursor.execute(sql_1)
                maxnum = cursor.fetchone()[0]  # fetchone() returns a tuple
        except Exception:
            traceback.print_exc()
        return maxnum

    def GetCidBySname(self, city):
        """Return the id of the city whose Sname equals city.

        Falls back to 1 when the city is unknown or the query fails, and
        prints a diagnostic in either case.
        """
        cid = 1
        self.OpenDB()
        try:
            with self.db.cursor() as cursor:
                cursor.execute("select id from city where Sname = %s", (city,))
                row = cursor.fetchone()
            if row is None:
                print('city not found, falling back to cid 1:', city)
            else:
                cid = row[0]
        except Exception:
            traceback.print_exc()
        finally:
            self.CloseDB()
        return cid

    def Insert_City(self, data_dict):
        """Insert a city row built from data_dict['Sname'] / ['Lname'].

        Raises:
            SystemExit: if a city with the same Sname already exists. The
                connection is closed before the exit propagates.
        """
        self.OpenDB()
        try:
            num = self._next_id('city')
            # Refuse to insert a duplicate short name.
            with self.db.cursor() as cursor:
                cursor.execute("select 1 from city where Sname = %s limit 1",
                               (data_dict['Sname'],))
                duplicate = cursor.fetchone() is not None
            if duplicate:
                raise SystemExit(data_dict['Sname']+' is here!')
            self._insert_row(
                "INSERT INTO city(id,Sname,Lname) VALUES (%s,%s,%s)",
                (num, data_dict['Sname'], data_dict['Lname']),
                data_dict)
        finally:
            self.CloseDB()

    def Insert_Weather(self, cid, data_dict):
        """Insert one weather snapshot for city id cid into `weather`.

        data_dict must provide Fevn, Ftemp, Fcond, Fhumi, Futime and Fwind;
        the `ts` column is set to the current local time.
        """
        self.OpenDB()
        try:
            num = self._next_id('weather')
            ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
            self._insert_row(
                "INSERT INTO weather(id,cid,Fevn,Ftemp,Fcond,Fhumi,Futime,Fwind,ts) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (num, cid, data_dict['Fevn'], data_dict['Ftemp'],
                 data_dict['Fcond'], data_dict['Fhumi'], data_dict['Futime'],
                 data_dict['Fwind'], ts_str),
                data_dict)
        finally:
            self.CloseDB()

    def Insert_Aqi(self, cid, data_dict):
        """Insert one air-quality snapshot for city id cid into `aqi`.

        data_dict must provide val, desc, pm10, pm25, no2, so2, o3 and co;
        the `ts` column is set to the current local time.
        """
        self.OpenDB()
        try:
            num = self._next_id('aqi')
            ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
            # `desc` is a reserved word in MySQL, so it must be backtick-quoted.
            self._insert_row(
                "INSERT INTO aqi(id,cid,val,`desc`,pm10,pm25,no2,so2,o3,co,ts) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (num, cid, data_dict['val'], data_dict['desc'],
                 data_dict['pm10'], data_dict['pm25'], data_dict['no2'],
                 data_dict['so2'], data_dict['o3'], data_dict['co'], ts_str),
                data_dict)
        finally:
            self.CloseDB()

def InsertCity(Sname,Lname):
    """Store a city (short name plus full name) via MysqlClass.Insert_City."""
    MysqlClass().Insert_City({'Sname': Sname, 'Lname': Lname})

def GetWeather(city):
    """Scrape the current-weather panel of a Guangdong city from Moji.

    Args:
        city: URL slug on tianqi.moji.com, e.g. 'pingshan-district'.

    Returns:
        list of six strings, in this order: environment summary, temperature,
        condition, update-time text, humidity text, wind text — e.g.
        ['61 良', '25', '晴', '今天21:23更新', '湿度 75%', '南风2级'].

    Raises:
        IndexError: if any selector matches nothing (layout change or an
            unexpected response).
    """
    url = 'https://tianqi.moji.com/weather/china/guangdong/'+city
    panel = 'body > div.wrap.clearfix.wea_info > div.left > '
    selectors = (
        panel + 'div.wea_alert.clearfix > ul > li > a > em',  # environment summary
        panel + 'div.wea_weather.clearfix > em',              # temperature
        panel + 'div.wea_weather.clearfix > b',               # condition
        panel + 'div.wea_weather.clearfix > strong',          # update time
        panel + 'div.wea_about.clearfix > span',              # humidity
        panel + 'div.wea_about.clearfix > em',                # wind
    )
    listweather = []
    # This function runs in an endless polling loop, so the session (and its
    # connection pool) must be released after every call.
    with HTMLSession() as session:
        page = session.get(url).html
        for selector in selectors:
            element = page.find(selector, first=True)
            if element is None:
                # Same exception type as indexing an empty result list,
                # but it names the selector that failed.
                raise IndexError('selector %r matched nothing at %s' % (selector, url))
            listweather.append(element.text)
    return listweather

def GetAqi(city):
    """Scrape the air-quality page of a Guangdong city from Moji.

    Args:
        city: URL slug on tianqi.moji.com, e.g. 'pingshan-district'.

    Returns:
        list of eight strings, in this order: AQI value, AQI description,
        PM10, PM2.5, NO2, SO2, O3, CO — e.g.
        ['61', '良', '61', '55', '12', '3', '42', '6'].

    Raises:
        IndexError: if any selector matches nothing (layout change or an
            unexpected response).
    """
    url = 'https://tianqi.moji.com/aqi/china/guangdong/' + city
    item = '#aqi_info > div.aqi_info_item > ul > li:nth-child(%d) > span'
    # The six list items are, in page order: PM10, PM2.5, NO2, SO2, O3, CO.
    selectors = ['#aqi_value', '#aqi_desc'] + [item % pos for pos in range(1, 7)]
    listaqi = []
    # Release the session's connection pool after every call; this function
    # is invoked repeatedly by a long-running loop.
    with HTMLSession() as session:
        page = session.get(url).html
        for selector in selectors:
            element = page.find(selector, first=True)
            if element is None:
                # Same exception type as indexing an empty result list,
                # but it names the selector that failed.
                raise IndexError('selector %r matched nothing at %s' % (selector, url))
            listaqi.append(element.text)
    return listaqi

def SaveWeatherInfo(city):
    """Scrape weather and AQI data for city and store both in MySQL.

    The scraped strings are converted to the column types expected by
    MysqlClass.Insert_Weather / Insert_Aqi before being written.
    """
    print('update WeatherInfo per 30min')

    # e.g. ['61 良', '25', '晴', '今天21:23更新', '湿度 75%', '南风2级']
    evn, temp, cond, updated, humidity, wind = GetWeather(city)
    # Drop the two leading and two trailing characters around "HH:MM",
    # then prefix today's date and append seconds.
    strTime = time.strftime('%Y-%m-%d ') + updated[2:-2] + ':00'
    # The percentage is the second space-separated token; strip the '%'.
    strHumi = humidity.split(' ')[1][:-1]
    dictWeather = {
        'Fevn': evn,
        'Ftemp': int(temp),
        'Fcond': cond,
        'Fhumi': int(strHumi),
        'Futime': strTime,
        'Fwind': wind,
    }

    # e.g. ['61', '良', '61', '55', '12', '3', '42', '6']
    aqi_fields = GetAqi(city)
    dictAqi = {'val': int(aqi_fields[0]), 'desc': str(aqi_fields[1])}
    for key, raw in zip(('pm10', 'pm25', 'no2', 'so2', 'o3', 'co'), aqi_fields[2:]):
        dictAqi[key] = int(raw)

    store = MysqlClass()
    cid = store.GetCidBySname(city)
    store.Insert_Weather(cid, dictWeather)
    store.Insert_Aqi(cid, dictAqi)

def GetTimestamp():
    """Return the current local time as whole seconds since the epoch."""
    # timetuple() drops the microseconds, matching a round-trip through a
    # second-resolution string.
    now_tuple = datetime.datetime.now().timetuple()
    return int(time.mktime(now_tuple))

if __name__ == '__main__':
    city = 'pingshan-district'
    # InsertCity(city,"中国广东省深圳市坪山区")  # run once (or insert by hand) if the city row is missing

    # Wait for the next xx:00 or xx:30 so every sample lands on a half-hour
    # boundary. The seconds are forced to 00 so ts1 is exactly on the boundary.
    strT1 = time.strftime('%Y-%m-%d %H:%M:') + '00'
    while strT1[14:16] not in ('00', '30'):  # characters 14-15 are the minutes
        time.sleep(30)
        strT1 = time.strftime('%Y-%m-%d %H:%M:') + '00'
    ts1 = time.mktime(time.strptime(strT1, '%Y-%m-%d %H:%M:%S'))

    while True:
        SaveWeatherInfo(city)
        # The site refreshes roughly every half hour, so one sample per
        # 1800 s is enough; poll the clock until the next slot is due.
        while GetTimestamp() < ts1 + 1800:
            time.sleep(20)
        ts1 = ts1 + 1800

KeepAlive脚本:(防止程序异常挂掉)

#!/bin/sh

# Watchdog: every 30 seconds check whether PrintMoJi.py is running and
# start it in the background when it is not.

# The crawler is started with a relative path, so a failed cd must abort
# instead of looping forever in the wrong directory.
cd /home/xg/code/PowerPredict/ || exit 1

while true
do
    echo "I'm KeepAlive!"
    # The [P] bracket expression keeps grep from matching its own entry
    # in the process list; -q only sets the exit status.
    if ps -ef | grep -q '[P]rintMoJi\.py'
    then
        echo "MoJiWeather is running!"
    else
        echo "MoJiWeather is dead!"
        python3.6 PrintMoJi.py &
    fi
    sleep 30
done

将该shell脚本添加到Linux开机启动项中,设置为后台运行就可以达到防止程序异常挂掉的目的了,具体操作见:Linux下防止程序挂掉的shell脚本

 

3、说明

本代码当前测试环境为python3.6.3,MySQL 5.7.13

参考文献:

如何用Python爬数据?(一)网页抓取 :https://blog.csdn.net/nkwshuyi/article/details/79435248
requests-html GitHub网址:https://github.com/kennethreitz/requests-html

注:requests-html 需要 Python 3.6 及以上版本

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

昕光xg

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值