1、天气数据：采用 Python 的 bs4 和 urllib，从网站 http://www.tianqihoubao.com/lishi/beijing/month/201710.html 爬取 2011年1月 到 2017年11月13日 的天气数据，主要有以下四个字段：data(时间)、weather(天气)、temperature(温度)、wind(风力风向)。
'''
Created on 2017-11-13
@author: chen
'''
import csv
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
def get_all_weather_url():
    """Yield the attribute dict of every month link on the history index page.

    Each yielded dict is the ``attrs`` mapping of an ``<a>`` tag inside the
    ``div.months`` navigation block (expected keys: ``href`` and ``title``).
    """
    # Use the response as a context manager so the HTTP connection is closed
    # deterministically instead of being leaked until GC.
    with urlopen("http://www.tianqihoubao.com/lishi/beijing/month/201101.html") as response:
        bs_obj = BeautifulSoup(response.read(), "lxml")
    # NOTE(review): if the site layout changes, find() returns None and the
    # next line raises AttributeError — acceptable fail-fast for a crawler.
    months = bs_obj.find("div", {"class": "months"})
    for month in months.find_all("a"):
        yield month.attrs
# url处理
def get_page_url_weather():
    """Yield ``(absolute_url, title)`` for each monthly history page.

    Builds the full URL from the relative ``href`` found on the index page.
    """
    base = "http://www.tianqihoubao.com/"
    for attrs in get_all_weather_url():
        # urljoin avoids the doubled slash that plain string concatenation
        # produces when the href already starts with "/".
        yield urljoin(base, attrs["href"]), attrs["title"]
def get_weather_data():
    """Yield one ``(date, weather, temperature, wind)`` tuple per table row.

    Fetches every monthly page produced by :func:`get_page_url_weather`,
    parses the first ``<table>`` on each page, and cleans each cell's text.
    """
    def _clean(cell):
        # Collapse the cell text to a single token: join fragments with a
        # space, then strip all whitespace remnants (same chain as before).
        return cell.get_text(" ", strip=True).replace("\r\n", "").replace(" ", "")

    seen_urls = set()
    for url, _title in get_page_url_weather():
        # The index may list the same month twice; fetch each URL only once.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        page = BeautifulSoup(urlopen(url).read(), "lxml")
        for row in page.table.find_all("tr"):
            cells = row.find_all("td")
            # Header or malformed rows have fewer than 4 <td> cells;
            # indexing cells[3] there would raise IndexError and abort
            # the whole crawl.
            if len(cells) < 4:
                continue
            data, weather, temperature, wind = (_clean(c) for c in cells[:4])
            yield data, weather, temperature, wind
def main():
    """Crawl all Beijing weather history rows and append them to a CSV file."""
    out_path = r"C:\Users\chen\Desktop\北京天气数据每日更新.csv"
    with open(out_path, "w+", newline="") as csv_file:
        writer = csv.writer(csv_file)
        # enumerate replaces the hand-rolled counter; output is unchanged.
        for count, record in enumerate(get_weather_data(), start=1):
            writer.writerow(list(record))
            print("第" + str(count) + "次写入成功")
    print("写入完毕")


if __name__ == '__main__':
    main()
2、空气质量数据：包含以下字段的数据
data, Quality, AQI_data, AQI_rank, PM2_5, PM10, SO2, NO2, CO2, O3
'''
Created on 2017-11-14
@author: chen
'''
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
def get_all_weather_url():
    """Yield the ``<a>`` tag of every monthly AQI page linked from the index."""
    # Context-manage the response so the HTTP connection is closed promptly.
    with urlopen("http://www.tianqihoubao.com/aqi/beijing-201708.html") as response:
        bs_obj = BeautifulSoup(response.read(), "lxml")
    box = bs_obj.find("div", {"class": "box p"})
    for item in box.find_all("li"):
        link = item.a
        # An <li> without an <a> would previously yield None, and the
        # consumer's link["href"] would crash on it — skip such entries.
        if link is not None:
            yield link
# url处理
def get_page_url_weather():
    """Yield the absolute URL of each monthly AQI page."""
    base = "http://www.tianqihoubao.com/"
    for link in get_all_weather_url():
        # urljoin handles hrefs that already begin with "/" without
        # producing a doubled slash, unlike plain concatenation.
        yield urljoin(base, link["href"])
def get_weather_data():
    """Yield one 10-field tuple per day from the monthly AQI tables.

    Field order: date, quality grade, AQI, AQI rank, PM2.5, PM10, SO2,
    NO2, CO2, O3.
    NOTE(review): the original code names the 9th value CO2; on these pages
    the column is presumably CO (carbon monoxide) — confirm against the site.
    """
    def _clean(cell):
        # Same cleaning chain as the original per-cell expression.
        return cell.get_text(" ", strip=True).replace("\r\n", "").replace(" ", "")

    seen_urls = set()
    for url in get_page_url_weather():
        # Fetch each monthly page at most once.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        page = BeautifulSoup(urlopen(url).read(), "lxml")
        for row in page.table.find_all("tr"):
            cells = row.find_all("td")
            # Header or malformed rows have fewer than 10 <td> cells;
            # indexing cells[9] there would raise IndexError and abort the
            # crawl. (Also drops the leftover debug print of len/url.)
            if len(cells) < 10:
                continue
            yield tuple(_clean(c) for c in cells[:10])
def main():
    """Crawl all Beijing AQI rows and append them to a CSV file."""
    out_path = r"C:\Users\chen\Desktop\北京每日空气质量更新.csv"
    with open(out_path, "w+", newline="") as csv_file:
        writer = csv.writer(csv_file)
        # enumerate replaces the hand-rolled counter; output is unchanged.
        for count, record in enumerate(get_weather_data(), start=1):
            writer.writerow(list(record))
            print("第" + str(count) + "次写入成功")
    print("写入完毕")


if __name__ == '__main__':
    main()