环境:
mysql+python3.5+pycharm
sql脚本:
CREATE TABLE `AQIHH` (
`CITYCODE` VARCHAR(12) NOT NULL,
`DATATIME` DATETIME NOT NULL,
`AQI` VARCHAR(20) DEFAULT NULL,
`PM25` VARCHAR(20) DEFAULT NULL,
`PM10` VARCHAR(20) DEFAULT NULL,
`SO2` VARCHAR(20) DEFAULT NULL,
`NO2` VARCHAR(20) DEFAULT NULL,
`CO` VARCHAR(20) DEFAULT NULL,
`O3` VARCHAR(20) DEFAULT NULL,
`UPDATETIME` DATETIME DEFAULT NULL,
`FS` VARCHAR(20) DEFAULT NULL,
`XDSD` VARCHAR(20) DEFAULT NULL,
`JSL` VARCHAR(20) DEFAULT NULL,
`FX` VARCHAR(20) DEFAULT NULL,
PRIMARY KEY (`CODE`,`DATATIME`)
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;
CREATE TABLE `t_ods_wea_city` (
`citycode` VARCHAR(20) DEFAULT NULL,
`cityname` VARCHAR(50) DEFAULT NULL
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;
-- 这里只添加了2个城市的
INSERT INTO t_ods_wea_city(citycode,cityname) VALUES('101010100','北京'),('101020100','上海');
直接上详细代码把:
# encoding=utf-8
# import requests 这是python2里的
import json
import time
# from urllib.request import urlopen 另一种用法
import urllib.request
import urllib
import datetime
import random
import socket
import http
# 正则
import re
# db
import pymysql
db = pymysql.connect(host="192.168.xx.xx", port=3306, user="root", passwd='123456', db="r_db", charset="utf8")
def requests(citycode):
#citycode = 101280601
# 请求地址http://d1.weather.com.cn/aqi_all/101280601.html?_=1576479057467,后边的一串数字是时间戳可以随机或者确定的不影响,这里用time.time()
# unixtime-> format time >>> #转换成localtime -> time_local = time.localtime(time_now) #转换成新的时间格式(2016-05-09 18:59:20) dt = time.strftime("%Y-%m-%d %H:%M:%S",time_local)
time_now = str(int(time.time()))
url = "http://d1.weather.com.cn/aqi_all/" + str(citycode) + ".html?_="+time_now
# 请求头,不加获取不到数据
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
"Referer": "http://www.weather.com.cn/air/?city=101280601",
}
# 超时,取随机数是因为防止被网站认定为网络爬虫
timeout = random.choice(range(80, 180))
# 请求对象
req = urllib.request.Request(url, headers=headers)
# 这里使用request对象打开
# res = urllib.request.urlopen(req,timeout=timeout).read()
#res = urllib.request.urlopen(req,timeout=0.001).read()
#timeout = 0.001
while True:
try:
# 获取请求数据
res = urllib.request.urlopen(req, timeout=timeout).read()
break
except socket.timeout as e:
print('error_2:', e)
# test timeout case
# timeout = random.choice(range(80, 180))
time.sleep(random.choice(range(8, 15)))
except socket.error as e:
print('error_3:', e)
# timeout = random.choice(range(80, 180))
time.sleep(random.choice(range(20, 60)))
# except http.client.BadStatusLine as e:
# print('5:', e)
# time.sleep(random.choice(range(30, 80)))
# except http.client.BadStatusLine as e:
# print('6:', e)
# time.sleep(random.choice(range(5, 15)))
return res
def parse(res):
# p1 = re.compile(r'[(](.*?)[)]', re.S) # 最小匹配
# 使用正则取出第一个方括号中的内容
p2 = re.compile(r'[[](.*)[]]', re.S) # 贪婪匹配
json_str = re.findall(p2, res.decode('utf-8'))
# json反序列化(字符串转成字典)方法 loads:无文件操作 load: 读文件+反序列化
# json序列化(字典转成字符串)方法: dumps:无文件操作 dump:序列化+写入文件
try:
json_str1 = json.loads('[' + json_str[0] + ']')
except Exception as e:
print (e)
return None
return json_str1
def save(list,citycode):
# 计数 100条提交一次 cur_cnt 当前计数
commit_cnt = 10
cur_cnt = 0
count = len(list)
# 作用 1,过滤指定内容,2 映射为想要的字串
subkey = {'time': 'HH24', 't1': 'AQI', 't3': 'PM25', 't4': 'PM10', 't5': 'CO', 't6': 'NO2', 't7': 'O3', 't9': 'SO2',
't11': 'FS', 't12': 'XDSD', 't13': 'JSL', 't14': 'FX'} # 最后分别是风速,相对湿度,降水量,风向
# 使用cursor()方法获取游标对象
cursor = db.cursor()
# 判断是否是今天
vtatusv = 1
# 当天昨天
nday = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
yday = datetime.datetime.now().strftime('%Y-%m-%d')
# 批量插入的list
dv_list = []
# 插入的sql
sql = "replace into AQIHH(AREACODE,DATATIME,AQI,PM25,PM10,SO2,NO2,CO,O3,FS,XDSD,JSL,FX)" \
" values(%s,%s,%s,%s,%s ,%s,%s,%s,%s,%s,%s,%s,%s)"
for dict in list:
# 这里用到了 列表推导式(for表达式)
subdict = {value: dict[key] for key, value in subkey.items()} # 提取原字典中部分键值对,并替换key为中文
hh = subdict['HH24']
# print(subdict.keys())
if subdict['HH24'] == '00':
vtatusv = 0
if vtatusv == 0:
datatime = yday + ' ' + hh + ':00:00'
else:
datatime = nday + ' ' + hh + ':00:00'
#print(subdict)
# print(datatime)
aqi_status = len(subdict['AQI'].strip())
#如果有无效的计数减去1
if aqi_status == 0:
count -= 1
# 判断如果有AQI信息则执行入库
# if len(subdict['AQI'].strip()) != 0:
if len(hh) !=0 :
try: # ,SO2,NO2,CO,O3,FS,XDSD,JSL,FX
# 计数 100条提交一次
cur_cnt += 1
if len(subdict['AQI'].strip()) != 0:
AQI = (subdict['AQI'])
else:
AQI = None
if len(subdict['PM25'].strip()) != 0:
PM25 = (subdict['PM25'])
else:
PM25 = None
if len(subdict['PM10'].strip()) != 0:
PM10 = (subdict['PM10'])
else:
PM10 = None
if len(subdict['SO2'].strip()) != 0:
SO2 = (subdict['SO2'])
# SO2 = int(subdict['SO2'])
else:
SO2 = None
if len(subdict['NO2'].strip()) != 0:
NO2 = (subdict['NO2'])
else:
NO2 = None
if len(subdict['CO'].strip()) != 0:
CO = (subdict['CO'])
else:
CO = None
if len(subdict['O3'].strip()) != 0:
O3 = (subdict['O3'])
else:
O3 = None
if len(subdict['FS'].strip()) != 0:
FS = subdict['FS']
else:
FS = None
if len(subdict['XDSD'].strip()) != 0:
XDSD = (subdict['XDSD'])
else:
XDSD = None
if len(subdict['JSL'].strip()) != 0:
JSL = (subdict['JSL'])
else:
JSL = None
if len(subdict['FX'].strip()) != 0:
FX = subdict['FX'].strip()
else:
FX = None
dv_list.append((citycode, datatime,AQI, PM25, PM10, SO2, NO2, CO, O3 ,FS,XDSD, JSL, FX))
if cur_cnt%commit_cnt == 0 or cur_cnt == count:
#val = [(citycode, datatime, int(subdict['AQI']), PM25, PM10, SO2, NO2, CO, O3 ,FS,None, None, None)]
cursor.executemany(sql, dv_list)
# 提交到数据库执行
db.commit()
# 清零技术
dv_list.clear()
# if datatime=='2019-12-19 09:00:00':
# cursor.execute(
# "replace into AQIHH(AREACODE,DATATIME,AQI,PM25,PM10,SO2,NO2,CO,O3,FS,XDSD,JSL,FX)"
# " values(%s,'%s',%d,%d,%d ,%d,%d,%f,%d,%s,%d,%f,'%s')" % (
# citycode, datatime, int(subdict['AQI']),PM25, PM10, SO2, NO2, CO, O3,FS,XDSD,JSL,FX))
except Exception as e:
# 如果发生错误
db.rollback()
print(e)
print('----------------------------------')
# 最后一批处理
try:
cursor.executemany(sql, dv_list)
# 提交到数据库执行
db.commit()
# 清零技术
dv_list.clear()
except Exception as e:
# 如果发生错误
db.rollback()
print(e)
if __name__ == '__main__':
citycode_list = []
r_cursor = db.cursor()
r_cursor.execute('SELECT citycode FROM t_ods_wea_city limit 1')
#rows = r_cursor.execute(None, {'ln': 9}) 101090401 101010100 and citycode="101090401"
rows = r_cursor.fetchall()
for r in rows:
citycode_list.append(r[0])
r_cursor.close()
#citycode_list = ['101280601','101010100']
for citycode in citycode_list:
print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+' ########'+citycode+':start!!!#######')
# 获取数据
res = requests(citycode)
#解析
pas = parse(res)
# 入库
if pas != None:
save(pas, citycode)
else:
print(citycode+':解析失败')
print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+' ########' + citycode + ':end!!!#######')
# 休息1s
#time.sleep(3.1)