1.概述
利用爬虫、仿真、官方API等手段,收集不同城市、不同区域的气象信息,并在地图上进行动态展示。
从中国天气网站中(www.weather.com.cn),收集不同城市或区域的气象信息(温度、湿度、PM2.5等);
对不同区域的气候进行分析处理,并生成相关热力图;
以动态可视化方式,在地图中显示气候的相关信息,以及未来一段时间的变化趋势。
2.数据爬取
数据来源网站:天气网(www.weather.com.cn)
爬取对象:全国各区县
爬取内容:今日天气的实时温度、相对湿度、空气质量指数,本月每日历史天气数据及预测数据
附上我的爬虫代码:
import requests
from lxml import etree
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import datetime
# Global accumulators shared by the scraping functions below.  Each list is
# appended to in page order, so parallel lists stay index-aligned.
ProvinceName = [] # province names
CityName = [] # city names
CountyName = [] # county/district names
CountyHref = [] # county page links
TodayUrl = [] # links to the "today" weather pages
FortyUrl = [] # links to the 40-day weather pages
nowTodeyTemp = [] # current temperature (NOTE: "Todey" typo kept — referenced elsewhere)
RHToday = [] # relative humidity
AQIToday = [] # air quality index
weatherWeek = [] # weather conditions (7-day scraper, currently commented out)
MaxTempWeek = [] # daily high temperature (7-day, unused)
MinTempWeek = [] # daily low temperature (7-day, unused)
ProvinceNameForty = [] # province names, repeated per 40-day row
CityNameForty = [] # city names, repeated per 40-day row
CountyNameForty = [] # county names, repeated per 40-day row
dateForty = [] # dates
MaxTempHistory = [] # historical-average high temperature
MinTempHistory = [] # historical-average low temperature
PreProHistory = [] # historical-average precipitation probability
MaxTempForty = [] # forecast high temperature
MinTempForty = [] # forecast low temperature
WeatherForty = [] # weather conditions
# Start the browser driver (module-level side effect: launches Chrome).
options = webdriver.ChromeOptions()
options.add_argument('--headless') # headless mode: no visible browser window
driver = webdriver.Chrome(options=options)
# 获取文本内容
# Fetch a page and return its HTML text.
def getHTMLText(url, timeout=30):
    """Download *url* and return the body decoded as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds before the request is aborted.  New parameter with
            a default, so existing callers are unaffected; the original call
            had no timeout and could hang forever on a stalled connection.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=timeout)
    # The site serves UTF-8 but does not always declare it; force decoding.
    r.encoding = 'utf-8'
    return r.text
# 获取全国各省的链接
# Extract the province links from the national overview page.
def getProvinceUrl(r):
    """Parse the overview page HTML *r* and return a list of province hrefs."""
    tree = etree.HTML(r)
    return tree.xpath('/html/body/div[4]/div[1]//a//@href')
# 获取全国各区县的链接
# Extract the county links from a province page.
def getCountyUrl(r):
    """Parse a province page HTML *r* and return a list of county hrefs."""
    tree = etree.HTML(r)
    return tree.xpath('/html/body/div[4]/div[2]/div/div/div[2]/div[1]/div/table/tr/td/a//@href')
# 获取各区县今日、四十天天气链接
# Collect names and the today / 40-day weather links from one county page.
def getWeatherUrl(r):
    """Parse county page HTML *r* and append the province, city and county
    names plus the "today" and 40-day weather URLs to the module lists."""
    tree = etree.HTML(r)
    # Breadcrumb: province / city / county names.
    ProvinceName.extend(tree.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a[2]/text()'))
    city = tree.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a[3]/text()')
    if not city:
        # No city level in the breadcrumb (municipality / special region):
        # mark the city as '-' and the county sits one position earlier.
        CityName.append('-')
        CountyName.extend(tree.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/span[3]/text()'))
    else:
        CityName.extend(city)
        CountyName.extend(tree.xpath('/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/span[4]/text()'))
    # "Today" weather link.
    TodayUrl.extend(tree.xpath('//*[@id="someDayNav"]/li[1]/a//@href'))
    # 40-day weather link.
    FortyUrl.extend(tree.xpath('//*[@id="someDayNav"]/li[4]/a/@href'))
# 获取今日天气的信息
def _first_text(tags, attr):
    """Return ``tags[0].<attr>.text`` or ``'unknown'`` when the element or
    nested tag is missing (same fallback the original repeated three times)."""
    try:
        return getattr(tags[0], attr).text
    except (AttributeError, IndexError):
        return 'unknown'

# Scrape today's realtime readings for one county page.
def getTodayMessage(url):
    """Render *url* with Selenium and append the current temperature,
    relative humidity and air-quality index to the module-level lists.

    Missing elements append the sentinel string 'unknown' instead of raising,
    keeping the parallel lists index-aligned.
    """
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")
    # Current temperature.
    nowTodeyTemp.append(_first_text(html.find_all("div", class_="tem"), "span"))
    # Relative humidity.
    RHToday.append(_first_text(html.find_all("div", class_="zs h"), "em"))
    # Air quality index.
    AQIToday.append(_first_text(html.find_all("div", class_="zs pol"), "span"))
# # 获取七天天气的信息
# def getWeekMessage(url):
# driver.get(url)
# content = driver.page_source
#
# html = BeautifulSoup(content, "html.parser")
#
# tem = html.find_all("ul", class_="t clearfix")
# li_tags = tem[0].find_all("li")
# for i in li_tags:
# p = i.find_all("p")
# # 获取天气状况
# try:
# result = p[0].text
# except AttributeError:
# result = 'unknown'
# except IndexError:
# result = 'unknown'
# weatherWeek.append(result)
# # 获取最高温度
# try:
# result = p[1].span.text
# except AttributeError:
# result = 'unknown'
# except IndexError:
# result = 'unknown'
# MaxTempWeek.append(result)
# # 获取最低温度
# try:
# result = p[1].i.text
# except AttributeError:
# result = 'unknown'
# except IndexError:
# result = 'unknown'
# MinTempWeek.append(result)
# 获取四十天天气的信息
# Scrape the 40-day calendar page for one county.
def getFortyMessage(url):
    """Render *url* with Selenium and append one month of rows (date,
    historical-average temps and precipitation probability, forecast
    temps, weather text) to the module-level ``*History``/``*Forty`` lists.

    NOTE(review): the loop bounds (``i = 4`` .. ``i < 34``) and the offset
    windows below depend on the calendar layout of the current month and
    must be re-checked when the month changes (per the original comment).
    """
    driver.get(url)
    content = driver.page_source
    html = BeautifulSoup(content, "html.parser")
    tem = html.find_all("div", class_="W_left")
    # The second "W_left" div holds the calendar's table cells.
    td_tags = tem[1].find_all("td")
    print(len(td_tags))
    # Adjust these indices to match the live page's current layout.
    i = 4
    z = i
    while(i<34):
        # Date.
        span_tags = td_tags[i].find_all("span")
        try:
            result = span_tags[1].text
        except AttributeError:
            result = 'unknown'
        except IndexError:
            result = 'unknown'
        dateForty.append(result)
        tem = td_tags[i].find_all("div", class_="w_xian")
        span_tags = tem[0].find_all("span")
        # Historical-average high temperature.
        try:
            result = span_tags[0].text
        except AttributeError:
            result = 'unknown'
        except IndexError:
            result = 'unknown'
        MaxTempHistory.append(result)
        # Historical-average low temperature.
        try:
            result = span_tags[1].text
        except AttributeError:
            result = 'unknown'
        except IndexError:
            result = 'unknown'
        MinTempHistory.append(result)
        # Historical-average precipitation probability.
        try:
            result = span_tags[2].text
        except AttributeError:
            result = 'unknown'
        except IndexError:
            result = 'unknown'
        PreProHistory.append(result)
        if(i-z>=2):
            # Forecast high temperature.
            span_tags = td_tags[i].find_all("span", class_="w_day")
            try:
                result = span_tags[0].i.text
            except AttributeError:
                result = 'unknown'
            except IndexError:
                result = 'unknown'
            MaxTempForty.append(result)
            # Forecast low temperature.
            span_tags = td_tags[i].find_all("span", class_="w_night")
            try:
                result = span_tags[0].i.text
            except AttributeError:
                result = 'unknown'
            except IndexError:
                result = 'unknown'
            MinTempForty.append(result)
        else:
            # Days outside the forecast window carry no forecast temps.
            MaxTempForty.append('-')
            MinTempForty.append('-')
        # Weather text (only present inside a limited forecast window).
        if(1<i-z<17):
            p_tags = td_tags[i].find_all("p", class_="w_tqxx")
            try:
                result = p_tags[0].text
            except AttributeError:
                result = 'unknown'
            except IndexError:
                result = 'unknown'
            WeatherForty.append(result)
        else:
            WeatherForty.append('-')
        i += 1
# Print a progress timestamp between scraping stages.
def getTime():
    """Print the current local time ('%Y-%m-%d %H:%M:%S') framed by blank lines."""
    now = datetime.datetime.now()
    print('\n' + now.strftime('%Y-%m-%d %H:%M:%S') + '\n')
# getFortyMessage('http://www.weather.com.cn/weather40d/101240511.shtml')
# print(dateForty)
# print(len(dateForty))
# print(MaxTempForty)
# print(len(MaxTempHistory))
# print(MinTempForty)
# print(len(MinTempHistory))
# print(PreProHistory)
# print(len(PreProHistory))
# print(MaxTempForty)
# print(len(MaxTempForty))
# print(MinTempForty)
# print(len(MinTempForty))
# print(WeatherForty)
# print(len(WeatherForty))
# ---- Main scraping script (module-level side effects: network + CSV files) ----
url = 'http://www.weather.com.cn/textFC/hb.shtml'
# Province-level links.
Provincehref = getProvinceUrl(getHTMLText(url))
getTime()
print(Provincehref)
# County-level links.
for i in Provincehref:
    CountyHref.extend(getCountyUrl(getHTMLText('http://www.weather.com.cn'+i)))
getTime()
print(CountyHref)
# Names plus today/40-day links for each county.
# NOTE(review): the step of 2 presumably skips duplicate entries in the
# county table (each county linked twice) — confirm against the live site.
i=0
while(i<len(CountyHref)):
    getWeatherUrl(getHTMLText(CountyHref[i]))
    i += 2
getTime()
print(ProvinceName)
print(CityName)
print(CountyName)
print(TodayUrl)
print(FortyUrl)
# Today's weather for every county.
for i in TodayUrl:
    getTodayMessage('http://www.weather.com.cn'+i)
getTime()
print(RHToday)
# Persist today's weather.
df = pd.DataFrame({'ProvinceName':ProvinceName,'CityName':CityName,'CountyName':CountyName,'nowTodayTemp':nowTodeyTemp,'RHToday':RHToday,'AQIToday':AQIToday})
df.to_csv('weather_today_message3.csv', index=False, header=True)
getTime()
# Repeat the name columns 30 times each so they align with the 30 rows
# per county produced by getFortyMessage (needed for the 40-day CSV).
i = 0
while(i<len(ProvinceName)):
    j = 1
    while(j<=30):
        ProvinceNameForty.append(ProvinceName[i])
        CityNameForty.append(CityName[i])
        CountyNameForty.append(CountyName[i])
        j += 1
    i += 1
getTime()
print(ProvinceNameForty)
# 40-day data for every county.
for i in FortyUrl:
    getFortyMessage('http://www.weather.com.cn'+i)
getTime()
print(dateForty)
print(len(dateForty))
print(MaxTempForty)
print(len(MaxTempHistory))
print(MinTempForty)
print(len(MinTempHistory))
print(PreProHistory)
print(len(PreProHistory))
print(MaxTempForty)
print(len(MaxTempForty))
print(MinTempForty)
print(len(MinTempForty))
print(WeatherForty)
print(len(WeatherForty))
# Persist the 40-day data.
df = pd.DataFrame({'ProvinceNameForty':ProvinceNameForty,'CityNameForty':CityNameForty,'CountyNameForty':CountyNameForty,'dateForty':dateForty,'MaxTempHistory':MaxTempHistory,'MinTempHistory':MinTempHistory,'PreProHistory':PreProHistory,'MaxTempForty':MaxTempForty,'MinTempForty':MinTempForty,'WeatherForty':WeatherForty})
df.to_csv('weather_Forty_message.csv', index=False, header=True)
getTime()
# BUG FIX: use quit() instead of close().  close() only closes the current
# browser window and leaves the chromedriver process running; quit() ends
# the whole session and shuts the driver down.
driver.quit()
3.数据清洗
当时处理数据时比较粗糙
当该区县数据中某属性出现unknown值,采用该区县所属市的数据该属性的众数替换;如若该市只有该区县,则采用该区县所属省的数据该属性的众数替换。
处理 “-” 值:
出现原因:直辖市或特别行政区
处理方法:其城市名使用其省名替换
4.数据分析
对今日相对湿度的东中西部及特别行政区的对比分析:
对今日空气质量指数的东中西部及特别行政区的对比分析:
使用爬取的本月天气数据集,基于线性回归模型预测了未来五天的气温区间。
该预测效果不是很理想,当时未改进。
5.数据可视化
各区县温度的热力图:
各区县天气信息的地图展示:
6.说明
这是我当时的大作业,许多方面做的很粗糙。
数据清洗、分析、可视化模块的代码太多不便在帖子中写出,如果有需要的可以私信我。