爬取天气的网站选择的是http://henan.weather.com.cn/,爬取的结果是将河南省各个城市的天气信息保存在CSV文件中;如果想爬取其他省的天气,将网址修改为对应省份的网址即可。
代码如下:
import sys
import re
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
# Entry URL for the province-level weather portal; swap for another
# province's portal URL to scrape a different province.
URL = "http://henan.weather.com.cn/"
# Output CSV path (Windows drive path, as in the original script).
OUTPUT_PATH = r"e:\河南天气.csv"

# Column order of the resulting CSV (city, date, weather, temperature,
# wind force, wind direction).
COLUMNS = ['城市', '日期', '天气', '温度', '风力', '风向']


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed as a BeautifulSoup tree.

    Uses a context manager so the HTTP response is always closed
    (the original script leaked every urlopen handle).
    """
    with urllib.request.urlopen(page_url) as resp:
        return BeautifulSoup(resp.read(), 'lxml')


def _scrape_city(base_url, city_link):
    """Scrape one city's current forecast.

    Parameters
    ----------
    base_url : str
        Portal root; city links on the index page are relative to it.
    city_link : bs4.Tag
        An ``<a>`` element from the portal's ``.navbox span a`` city list.

    Returns
    -------
    dict mapping each CSV column name to its scraped value.

    Raises
    ------
    IndexError / AttributeError if the site's markup changes and a
    selector or the date regex no longer matches.
    """
    city_soup = _fetch_soup(base_url + city_link['href'])

    # The page heading (.weatheH1) embeds the report date as YYYY-MM-DD.
    heading = city_soup.select('.weatheH1')[0].get_text()
    report_date = re.search(r'\d{4}-\d{2}-\d{2}', heading).group(0)

    # First link in the forecast box points at the detailed forecast page.
    forecast_url = city_soup.select('.forecastBox dl dt a')[0]['href']
    forecast_soup = _fetch_soup(forecast_url)

    # Hoist the repeated selections: <p> holds the weather text, its <i>
    # children hold temperature then wind force, and the <em><span> title
    # attribute carries the wind direction.
    paragraphs = forecast_soup.select('.c7d ul li p')
    italics = forecast_soup.select('.c7d ul li p i')
    direction_span = forecast_soup.select('.c7d ul li p em span')[0]

    return {
        '城市': city_link.get_text(),
        '日期': report_date,
        '天气': paragraphs[0].get_text(),
        '温度': italics[0].get_text(),
        '风力': italics[1].get_text(),
        '风向': direction_span['title'],
    }


def _scrape_province(base_url):
    """Scrape every city listed on the province portal at *base_url*.

    Returns a DataFrame with one row per city and the columns in
    ``COLUMNS``. Building row dicts (instead of six parallel lists)
    keeps each row's fields together, so a mid-loop failure cannot
    desynchronize the columns.
    """
    index_soup = _fetch_soup(base_url)
    city_links = index_soup.select('.navbox span a')
    rows = [_scrape_city(base_url, link) for link in city_links]
    return pd.DataFrame(rows, columns=COLUMNS)


def main():
    """Scrape the portal and append the result to OUTPUT_PATH as CSV."""
    frame = _scrape_province(URL)
    # NOTE(review): mode='a' preserves the original append behavior, but
    # to_csv writes the header on every run, so repeated runs produce
    # duplicate header rows — confirm whether mode='w' (or
    # header=not os.path.exists(OUTPUT_PATH)) was intended.
    frame.to_csv(OUTPUT_PATH, index=False, encoding='utf_8', mode='a')


if __name__ == "__main__":
    main()
CSV文件截图如下: