爬虫笔记2--爬取2345网站历史天气
最近需要获取某些地区的历史气象信息,墨迹天气无法获取历史数据,就在网上看了下,发现2345网站有相对完善的历史气象信息,就爬了下来并保存到MySql数据中。
1、功能
本代码主要功能为:爬取2345天气历史数据,将数据保存到MySQL数据库。其中数据库的建立脚本和数据的插入脚本在第二部分代码块中,此处不赘述。后续若发现更好的气象数据,也会根据需要续更。
此处简要说明一下如何获取2345气象网站数据,根据主界面按钮,找到其对应的js数据url,根据其url获取其数据即可,如下图所示:
此处需要获取三个主要参数,第一个为url格式(即Request URL),第二个为地区id(对应sql中2345city.number,此处天河区为72025),第三个为月份格式(最近两年的为yyyymmdd,但是早期的不完全为该格式,因此实际代码中最好将其手动放到一个list中),后续代码都是根据这三个主要参数将js返回数据转为python的dict数据,然后存入数据库。
2345天气预报:http://tianqi.2345.com/
2、代码
MySQL数据库脚本:
#2345city(id,Sname,Lname,number)
CREATE TABLE `weather`.`2345city` (
`id` INT NOT NULL,
`Sname` VARCHAR(50) NOT NULL,
`Lname` VARCHAR(200) NOT NULL,
`number` INT NOT NULL,
PRIMARY KEY (`id`))
COMMENT = '该表记录2345网站城市相关信息,\n字段:城市id,城市名称缩写,城市名称绝对地址,2345城市号码';
#'1', '龙岗', '中国广东省深圳市龙岗', '72039'
#2345overall(id,cid,maxWdndu,minWendu,avgbWendu,avgyWendu,maxAqi,minAqi,avgAqi,maxAqiInfo,maxAqiDate,maxAqiLevel,minAqiInfo,minAqiDate,minAqiLevel)
CREATE TABLE `weather`.`2345overall` (
`id` INT NOT NULL,
`cid` INT NOT NULL,
`maxWendu` VARCHAR(30) NOT NULL,
`minWendu` VARCHAR(30) NOT NULL,
`avgbWendu` VARCHAR(5) NOT NULL,
`avgyWendu` VARCHAR(5) NOT NULL,
`maxAqi` VARCHAR(5) NOT NULL,
`minAqi` VARCHAR(5) NOT NULL,
`avgAqi` VARCHAR(5) NOT NULL,
`maxAqiInfo` VARCHAR(20) NOT NULL,
`maxAqiDate` VARCHAR(20) NOT NULL,
`maxAqiLevel` VARCHAR(5) NOT NULL,
`minAqiInfo` VARCHAR(20) NOT NULL,
`minAqiDate` VARCHAR(20) NOT NULL,
`minAqiLevel` VARCHAR(5) NOT NULL,
`ts` DATETIME NOT NULL,
PRIMARY KEY (`id`))
COMMENT = '该表记录2345网站特定城市月天气综合评价,\n字段:id,城市id,月最高温度,月最低温度,月平均温度,最大Aqi,最小Aqi,平均Aqi,最大Aqi评价,最大Aqi日期,最大Aqi级别,最小Aqi评价,最小Aqi日期,最小Aqi级别,数据插入时间';
#2345his(id,cid,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel,ts)
CREATE TABLE `weather`.`2345his` (
`id` INT NOT NULL,
`cid` INT NOT NULL,
`ymd` DATETIME NOT NULL,
`bWendu` VARCHAR(5) NOT NULL,
`yWendu` VARCHAR(5) NOT NULL,
`tianqi` VARCHAR(10) NOT NULL,
`fengxiang` VARCHAR(20) NOT NULL,
`fengli` VARCHAR(10) NOT NULL,
`aqi` VARCHAR(5) NOT NULL,
`aqiInfo` VARCHAR(20) NOT NULL,
`aqiLevel` VARCHAR(5) NOT NULL,
`ts` DATETIME NOT NULL,
PRIMARY KEY (`id`))
COMMENT = '该表记录2345网站历史数据,\n字段:id,城市id,日期,最高温度,最低温度,天气类型,风向,风力,空气综合指数值,空气综合指数评价,空气指数级别,数据插入时间';
Python爬取数据脚本:
#!/usr/bin/env python2.7
# -*- coding:utf-8 -*-
import urllib2 #python2.7
import json
import chardet
import pymysql
import time
import traceback
# to address unicode and utf-8,this code is only supported by python2.7
# pingshan, longgang, shenzheng, guangzhou, china
class MysqlClass():
db = None
host = 'localhost'
usr = 'root'
pwd = 'YourPwd'
dbname = 'weather'
port = 3306
charset = 'utf8'
def ShowVersion(self):
db = pymysql.connect(self.host, self.usr, self.pwd, self.dbname, self.port)
# 使用 cursor() 方法创建一个游标对象 cursor
cursor = db.cursor()
# 使用 execute() 方法执行 SQL 查询
cursor.execute("SELECT VERSION()")
# 使用 fetchone() 方法获取单条数据.
data = cursor.fetchone()
print("Database version : %s " % data)
# 关闭数据库连接
db.close()
def OpenDB(self):
'''
打开mysql:
'''
self.db = pymysql.connect(host = self.host, user = self.usr, passwd = self.pwd, db = self.dbname,charset = self.charset)
def CloseDB(self):
"""
关闭sql
"""
self.db.close()
def ExcuteSQL(self,str_sql):
self.OpenDB()
try:
cursor = self.db.cursor()
cursor.execute(str_sql)
cursor.close()
self.db.commit()
except:
self.db.rollback()
traceback.print_exc()
self.CloseDB()
def GetMaxId(self,tableName):
sql_1 = "select max(id) from "+tableName
maxnum = 0
try:
cursor = self.db.cursor()
cursor.execute(sql_1)
ret1 = cursor.fetchone()
maxnum = ret1[0]#返回为tupple
cursor.close()
except :
self.db.rollback()
traceback.print_exc()
return maxnum
def Insert_2345City(self, data_dict):
''' 插入天气数据到weather表中 '''
self.OpenDB()
num = self.GetMaxId('2345city')
if(num==None):
num = 1
else:
num = num+1
#查询数据是否重复
if (num > 0):
cursor = self.db.cursor()
sql_1 = 'select * from 2345city where Sname=\'%s\' '% (data_dict['Sname'])
cursor.execute(sql_1)
ret1 = cursor.fetchall()
cursor.close()
if (len(ret1) > 0):
exit(data_dict['Sname']+' is here!')
#插入数据
sql_2 = "INSERT INTO 2345city(id,Sname,Lname,`number`) \
VALUES (%d,\'%s\',\'%s\',%d)"%(num,data_dict['Sname'],data_dict['Lname'],data_dict['number'])
try:
# 执行sql语句
cursor = self.db.cursor()
cursor.execute(sql_2)
cursor.close()
# 提交到数据库执行
self.db.commit()
except:
# 发生错误时回滚
print('error',data_dict)
self.db.rollback()
traceback.print_exc()
self.CloseDB()
def Insert_2345his(self, cid, data_dict):
''' 插入天气数据到weather表中 '''
self.OpenDB()
num = self.GetMaxId('2345his')
if(num==None):
num = 1
else:
num = num+1
#插入数据
ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
sql_1 = "INSERT INTO 2345his(id,cid,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel,ts) \
VALUES (%d,%d,\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\')"%(num,cid,data_dict['ymd'],data_dict['bWendu'],data_dict['yWendu'],data_dict['tianqi'],data_dict['fengxiang'],data_dict['fengli'],data_dict['aqi'],data_dict['aqiInfo'],data_dict['aqiLevel'],ts_str)
try:
# 执行sql语句
cursor = self.db.cursor()
cursor.execute(sql_1)
cursor.close()
# 提交到数据库执行
self.db.commit()
except:
# 发生错误时回滚
print('error',data_dict)
self.db.rollback()
traceback.print_exc()
self.CloseDB()
def Insert_Overall(self, cid, data_dict):
''' 插入天气数据到aqi表中 '''
self.OpenDB()
num = self.GetMaxId('2345overall')
if(num==None):
num = 1
else:
num = num + 1
#插入数据
ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
sql_1 = "INSERT INTO 2345overall(id,cid,maxWendu,minWendu,avgbWendu,avgyWendu,maxAqi,minAqi,avgAqi,maxAqiInfo,maxAqiDate,maxAqiLevel,minAqiInfo,minAqiDate,minAqiLevel,ts) \
VALUES (%d,%d,\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\')"%(num,cid,data_dict['maxWendu'],data_dict['minWendu'],data_dict['avgbWendu'],data_dict['avgyWendu'],data_dict['maxAqi'],data_dict['minAqi'],data_dict['avgAqi'],data_dict['maxAqiInfo'],data_dict['maxAqiDate'],data_dict['maxAqiLevel'],data_dict['minAqiInfo'],data_dict['minAqiDate'],data_dict['minAqiLevel'],ts_str)
try:
# 执行sql语句
cursor = self.db.cursor()
cursor.execute(sql_1)
cursor.close()
# 提交到数据库执行
self.db.commit()
except:
# 发生错误时回滚
print('error',data_dict)
self.db.rollback()
traceback.print_exc()
self.CloseDB()
def Print2345(url):
request=urllib2.Request(url)
RES=urllib2.urlopen(request).read()
if isinstance(RES, unicode):
RES=RES.encode('utf-8')
else:
RES=RES.decode('gb2312').encode('utf-8')
return RES[16:len(RES)-1]
def IsChar(ch):
if((ch>='a' and ch<='z') or (ch>='A' and ch<='Z')):
return True
else:
return False
def JSstr2StrJson(JSstr=''):
ret = ''
for i in range (0,len(JSstr)):
if(i==0):
ret = ret + JSstr[i]
elif(JSstr[i]=='\'' or JSstr[i]=='\"'):
ret = ret + '\"'
elif((JSstr[i-1]=='{') and (IsChar(JSstr[i]))):
ret = ret +'\"'
ret = ret + JSstr[i]
elif ((JSstr[i - 1] == ',') and (IsChar(JSstr[i]))):
ret = ret + '\"'
ret = ret + JSstr[i]
elif(IsChar(JSstr[i-1]) and (JSstr[i]==':')):
ret = ret+'\"'
ret = ret + JSstr[i]
else:
ret = ret+JSstr[i]
return ret
def JsonToDict(strJson):
#strJson = '{"obj1":1,"obj2":2,"arr1":[1,2]}'
return json.loads(strJson) #loads --json file 2 dict
def DictToJson(objDict):
#objDict = {'obj1': 1, 'obj2': 2, 'arr1': [1, 2]}
return json.dumps(objDict)
def GetWeatherByMonth(citynum,month):
url = 'http://tianqi.2345.com/t/wea_history/js/'+str(month)+'/'+str(citynum)+'_'+str(month)+'.js'
strJSJson = Print2345(url)
strJson = JSstr2StrJson(strJSJson)
#print strJson
objDict = JsonToDict(strJson)
return objDict
def SaveCity(Sname,Lname,num):
sql = MysqlClass()
dict_city = {'Sname':Sname,'Lname':Lname,'number':num}
sql.Insert_2345City(dict_city)
def SaveOverAll(cid,dictdata):
print 'Save overall to sql'
sql = MysqlClass()
sql.Insert_Overall(cid,dictdata)
def SaveHis(cid,dictdata):
print 'Save weather to sql'
sql = MysqlClass()
sql.Insert_2345his(cid,dictdata)
if __name__ == "__main__":
#SaveCity('天河','中国广东省广州市天河区',72025)
listMonth = [201704, 201705, 201706, 201707, 201708, 201709, 201710, 201711, 201712, 201801, 201802, 201803, 201804,
201805, 201806, 201807, 201808, 201809]
cityInfo = [[1,72039],[2,72025],[3,57494],[4,59287],[5,57687]] #(武汉,广州,长沙) #[[1,72039],[2,72025]] (龙岗,天河)
cid = cityInfo[1][0] #武汉
cnumber = cityInfo[1][1] #
for i in listMonth:
objDict = GetWeatherByMonth(cnumber,i)
objDictOverAll = {'maxWendu':objDict['maxWendu'],'minWendu':objDict['minWendu'],'avgbWendu':objDict['avgbWendu'],'avgyWendu':objDict['avgyWendu'],'maxAqi':objDict['maxAqi'],'minAqi':objDict['minAqi'],'avgAqi':objDict['avgAqi'],\
'maxAqiInfo':objDict['maxAqiInfo'],'maxAqiDate':objDict['maxAqiDate'],'maxAqiLevel':objDict['maxAqiLevel'],'minAqiInfo':objDict['minAqiInfo'],'minAqiDate':objDict['minAqiDate'],'minAqiLevel':objDict['minAqiLevel']}
objListWea = objDict['tqInfo']
SaveOverAll(cid,objDictOverAll)
for j in objListWea:
if(len(j)!=0):
SaveHis(cid,j)
3、说明
本代码当前测试环境为python2.7.13,MySQL 5.7.13;
若使用的是Linux服务器,需要将数据库表中包含中文的列字符集设置为utf-8,否则会出现插入数据失败的问题。