import re
import csv
import numpy
import codecs
from bs4 import BeautifulSoup
from urllib.request import urlopen
def get_table_head(url):
    """Fetch the header row of the AQI table on the given page."""
    html_bj = urlopen(url)
    bsp_bj = BeautifulSoup(html_bj, "lxml")  # build the BeautifulSoup object
    table_list = bsp_bj.findAll("tr")  # every row of the table
    data_list = []
    table_head = table_list[0].get_text().strip("\n").split("\n\n")
    data_list.append(table_head)  # keep only the header row
    return data_list
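
# A quick usage sketch (left commented out because it hits the live site);
# the exact header text printed depends on the page layout, which is an
# assumption here, not verified output:
# print(get_table_head("http://www.tianqihoubao.com/aqi/chongqing.html"))
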
def get_data(url):
    """Fetch the data rows (everything below the header) on the given page."""
    html_bj = urlopen(url)
    bsp_bj = BeautifulSoup(html_bj, "lxml")
    table_list = bsp_bj.findAll("tr")
    data_lists = []
    for data_list in table_list[1:]:
        data = data_list.get_text().replace(" ", "").replace("\n\r", ""). \
            strip("\n").split("\n")
        data_lists.append(data)
    return data_lists
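
# Each returned row is one day's record as a list of strings. A hedged
# sketch of the shape (the sample values are hypothetical, for illustration;
# Sites is built further below):
# rows = get_data(Sites[0])
# rows[0] -> ['2018-01-01', '良', '75', ...]
# print(rows[:2])
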
# Chongqing air quality index (AQI) / PM2.5 query page:
start_url = "http://www.tianqihoubao.com/aqi/chongqing.html"
html = urlopen(start_url)
bso_bj = BeautifulSoup(html, "lxml")  # build the BeautifulSoup object

# Find every link to a monthly data page and save the links in a list,
# ordered from the earliest month to the latest.
Sites = []
for link in bso_bj.findAll(href=re.compile("^(/aqi/chongqing-)")):
    site = "http://www.tianqihoubao.com" + link.attrs['href']
    Sites.append(site)
Sites.reverse()  # put the links in chronological order
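
# Sanity-check sketch: by construction every entry carries the monthly-page
# prefix matched above; the exact suffix format (e.g. "-YYYYMM.html") is an
# assumption about the site's URLs and is not checked here.
for site in Sites:
    assert site.startswith("http://www.tianqihoubao.com/aqi/chongqing-")
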
Data_set = get_table_head(Sites[0])  # start with the header row
for url in Sites:
    data_set = get_data(url)  # all rows for one month
    for data in data_set:
        del data[3]  # drop the unwanted field at index 3 so rows match the header width
    Data_set = numpy.vstack((Data_set, data_set))  # append this month below the rows so far
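
# At this point Data_set should be a two-dimensional array of strings:
# one header row plus one row per scraped day. A commented-out shape check:
# print(numpy.shape(Data_set))  # e.g. (number_of_days + 1, number_of_columns)
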
with open("Data.csv", "w+", encoding="utf-8", newline="") as csv_file:
    csv_file.write(codecs.BOM_UTF8.decode())  # BOM so Excel detects UTF-8
    writer = csv.writer(csv_file)
    for i in range(numpy.shape(Data_set)[0]):
        writer.writerow(Data_set[i, :])  # write the array to the CSV row by row
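
# Read-back sketch to confirm the file is well formed; "utf-8-sig" strips
# the BOM written above (standard library only):
# with open("Data.csv", encoding="utf-8-sig") as f:
#     print(next(csv.reader(f)))  # the header row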