# 对全国空气质量的相关指数(AQI指数、空气质量状况、PM10、PM2.5、一氧化碳、二氧化氮、臭氧等参数)进行爬取,生成以城市名为文件名的系列csv文件。
# 所爬取的网站地址如下:http://www.tianqihoubao.com/aqi/
# 所用到的所有库:
import requests
from bs4 import BeautifulSoup
import time
import csv
# function1. 以二维列表形式返回每个城市的城市名称与url
def get_city():
    """Scrape the national AQI index page and collect every city's link.

    Returns:
        list[list[str]]: two-element lists ``[city_name, absolute_url]``,
        one per city anchor found inside the "citychk" containers.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    res = requests.get("http://www.tianqihoubao.com/aqi/", headers=headers)
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, 'html.parser')
    # Every <a> inside a class="citychk" table points at one city's AQI page;
    # the hrefs are site-relative, so prefix the host to make them absolute.
    return [
        [anchor.text.strip(), "http://www.tianqihoubao.com" + anchor.attrs.get("href")]
        for container in soup.find_all(class_="citychk")
        for anchor in container.select("a")
    ]
# function2. 获取每个城市的历史时间列表,需要城市参数CityUrl;时间范围为2016年12月至2019年12月
def get_history(CityUrl, start_ts=1477929600.0, end_ts=1577808000.0):
    """Fetch a city's AQI page and list its monthly-history links in a window.

    Args:
        CityUrl: absolute URL of the city's AQI page (from ``get_city``).
        start_ts: exclusive lower bound as a Unix timestamp. The default
            (1477929600.0) was previously hard-coded; on a UTC+8 machine it
            corresponds to 2016-11-01, so months from 2016-12 onward pass.
        end_ts: exclusive upper bound as a Unix timestamp. The default
            (1577808000.0) corresponds to 2020-01-01 on a UTC+8 machine, so
            months up to 2019-12 pass.

    Returns:
        list[list[str]]: two-element lists ``[month_label, absolute_url]``.

    NOTE(review): the month label is parsed with ``time.mktime``, which uses
    the local timezone — on a non-UTC+8 machine the window shifts slightly;
    confirm if this script is run outside China Standard Time.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.36 Safari/537.36"}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    res = requests.get(CityUrl, headers=headers)
    res.encoding = "gbk"
    soup = BeautifulSoup(res.text, 'html.parser')
    tables = soup.find_all(class_="box p")
    citys_time = []
    for table in tables:
        data_times = table.select("a")
        for data_time in data_times:
            url = data_time.get("href")
            # Labels look like "YYYY年MM月" but arrive with stray CRLFs.
            Time = data_time.text.replace("\r\n", "").strip()
            ts = time.strptime(Time, "%Y年%m月")
            starTimeChuo = time.mktime(ts)
            # Strictly inside the window: both bounds are exclusive on purpose
            # (the lower bound is the month *before* the first wanted month).
            if starTimeChuo > start_ts and starTimeChuo < end_ts:
                citys_time.append([Time, "http://www.tianqihoubao.com/" + url])
    return citys_time
# function3. 时间戳转换——把月份字符串转成时间戳(这里爬取了2016年12月至2019年12月份的数据)
def transTime(starTime="2017-1", endTime="2020-01"):
    """Convert a pair of "YYYY-m" month strings to Unix timestamps.

    The original version computed both timestamps and silently discarded
    them (the function returned None); they are now returned so callers can
    feed them to ``get_history``'s window bounds. The defaults reproduce the
    values the original code computed.

    Args:
        starTime: lower-bound month, e.g. "2017-1".
        endTime: upper-bound month, e.g. "2020-01".

    Returns:
        tuple[float, float]: ``(start_timestamp, end_timestamp)``.

    NOTE(review): ``time.mktime`` interprets the struct in the machine's
    local timezone, so the exact float values are timezone-dependent.
    """
    import time
    # Lower bound of the scraping window.
    starTS = time.strptime(starTime, "%Y-%m")
    starTimeChuo = time.mktime(starTS)
    # Upper bound of the scraping window.
    endTS = time.strptime(endTime, "%Y-%m")
    endTimeChuo = time.mktime(endTS)
    return starTimeChuo, endTimeChuo
# function4. 获取每个城市每个月份的记录,无表头,返回的ulist为二维列表;需传入历史时间的url
def get_record(CityRecord):
    """Fetch one month's daily AQI table and return its data rows.

    Args:
        CityRecord: absolute URL of a monthly history page
            (from ``get_history``).

    Returns:
        list[list[str]]: one list of non-empty cell texts per table row,
        with the header row (``trs[0]``) skipped.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    res = requests.get(CityRecord, headers=headers)
    res.encoding = "gbk"
    soup = BeautifulSoup(res.text, 'html.parser')
    trs = soup.find_all('tr')
    ulist = []
    for tr in trs[1:]:
        # The original iterated every child of <tr> — including bare newline
        # text nodes — and called ``td.string.strip()``, which raises
        # AttributeError whenever ``.string`` is None (e.g. a cell with
        # nested markup). Selecting <td> tags and using get_text() is safe,
        # and dropping '' cells reproduces the old empty-string removal.
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        ulist.append([cell for cell in cells if cell != ''])
    return ulist
# 主函数部分,程序运行
if __name__ == "__main__":
    # Crawl every city, month by month, appending each month's rows to a
    # per-city CSV named after the city.
    citys = get_city()
    for city in citys[:]:  # 城市-- 0-374
        cityHistorys = get_history(city[1])
        print("当前正在写入的城市:"+city[0], "有"+str(len(cityHistorys))+"个月份")
        for cityHistory in cityHistorys[:]:  # 每个城市的月份--0-37
            print("当前正在写入的月份:"+cityHistory[0])
            print(cityHistory)
            cityRecord = get_record(cityHistory[1])
            with open(city[0]+'.csv', 'a', newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                # Append mode: write the header only when the file is new/empty
                # (the original left this row commented out, producing
                # unlabeled CSVs).
                if f.tell() == 0:
                    writer.writerow(["date", 'quality', 'AQI', 'AQI_rank', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3'])
                writer.writerows(cityRecord)
            # Throttle requests so the scraper doesn't hammer the site.
            time.sleep(1)