最近在学习Python,学完基础教程后,在网上看了下如何进阶,大家推荐的基本都是从爬虫开始,容易上手还有成就感,不至于那么容易中途放弃。所以我开始学习了爬虫,零基础学习经历:
1、在网上搜了一些爬虫的教学视频,基本上只要有一些Python基础就可以跟着做。
2、跟着视频学习,做一些基础练习。我学习的基础内容包括:请求方面的requests、urllib,还有scrapy框架(小爬虫基本用不上);数据清洗方面的lxml、bs4,以及正则表达式。
3、试着做一些实例,找一些简单的网页爬取信息。做着做着就发现,获取数据其实比较简单,数据清洗才是主要的工作量。
下面分享我写的天气预报爬虫,这是我目前能做到的最好的程度了。
import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
# Request helper
def request_url(url, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole script.

    Returns:
        The ``requests.Response`` object returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    return requests.get(url, headers=header, timeout=timeout)
# Disabled debugging helper, kept as a bare string literal so it is never
# executed: file_load(words) would write the given text to "file2.html".
"""
def file_load(words):
with open ("file2.html",'w',encoding=("UTF-8")) as f:
f.write(words)
"""
# Get province / municipality names and links; return a dict mapping each
# city name to its URL (this describes city_nameinfo defined below).
# cityhtml is a sample of the markup being parsed: a "nav-province" <div>
# per province followed by a <ul> of city links. It is not referenced by
# any code visible in this file.
cityhtml="""<div class="nav-province"><span class="expand"></span>北京市</div>
<ul>
<li><a href="/publish/forecast/ABJ/beijing.html" stationid="54511">北京</a></li>
<li><a href="/publish/forecast/ABJ/chaoyang.html" stationid="54433">朝阳</a></li>
<li><a href="/publish/forecast/ABJ/fengtai.html" stationid="54514">丰台</a></li>
<li><a href="/publish/forecast/ABJ/haidian.html" stationid="54399">海淀</a></li>
<li><a href="/publish/forecast/ABJ/shijingshan.html" stationid="54513">石景山</a></li>
<li><a href="/publish/forecast/ABJ/shunyi.html" stationid="54398">顺义</a></li>
<li><a href="/publish/forecast/ABJ/changping.html" stationid="54499">昌平</a></li>
<li><a href="/publish/forecast/ABJ/daxing.html" stationid="54594">大兴</a></li>
<li><a href="/publish/forecast/ABJ/fangshan.html" stationid="54596">房山</a></li>
<li><a href="/publish/forecast/ABJ/huairou.html" stationid="54419">怀柔</a></li>
<li><a href="/publish/forecast/ABJ/mentougou.html" stationid="54505">门头沟</a></li>
<li><a href="/publish/forecast/ABJ/miyun.html" stationid="54416">密云</a></li>
<li><a href="/publish/forecast/ABJ/pinggu.html" stationid="54424">平谷</a></li>
<li><a href="/publish/forecast/ABJ/tongzhou1.html" stationid="54431">通州</a></li>
<li><a href="/publish/forecast/ABJ/yanqing.html" stationid="54406">延庆</a></li>
</ul>
<div class="nav-province"><span class="collapsed"></span>天津市</div>
<ul style="display:none;">
<li><a href="/publish/forecast/ATJ/tianjin.html" stationid="54517">天津</a></li>
<li><a href="/publish/forecast/ATJ/xiqing.html" stationid="54527">西青</a></li>
<li><a href="/publish/forecast/ATJ/baodi.html" stationid="54525">宝坻</a></li>
<li><a href="/publish/forecast/ATJ/beichen.html" stationid="54528">北辰</a></li>
<li><a href="/publish/forecast/ATJ/binhaixinqu.html" stationid="54623">滨海新区</a></li>
<li><a href="/publish/forecast/ATJ/dongli.html" stationid="54526">东丽</a></li>
<li><a href="/publish/forecast/ATJ/jixian2.html" stationid="54428">蓟县</a></li>
<li><a href="/publish/forecast/ATJ/jinnan.html" stationid="54622">津南</a></li>
<li><a href="/publish/forecast/ATJ/jinghai.html" stationid="54619">静海</a></li>
<li><a href="/publish/forecast/ATJ/ninghe.html" stationid="54529">宁河</a></li>
<li><a href="/publish/forecast/ATJ/wuqing.html" stationid="54523">武清</a></li>
<li><a href="/publish/forecast/ATJ/dagang.html" stationid="54645">大港</a></li>
<li><a href="/publish/forecast/ATJ/hangu.html" stationid="54530">汉沽</a></li>
</ul>"""
def city_nameinfo():
    """Download the city-selection page and map each city name to its link.

    Every ``<li>`` found inside a ``<ul>`` on the page contributes one entry:
    the key is the item's text with surrounding whitespace removed, the value
    is the ``href`` attribute of the ``<a>`` inside it.

    Returns:
        dict: City name -> relative forecast URL.
    """
    url = "http://m.nmc.cn/f/forecast/selectCity"
    # Build the bs4 document from the raw response bytes.
    html = BeautifulSoup(request_url(url).content, "lxml")

    cityinfo = {}
    for city_list in html.find_all("ul"):
        for item in city_list.find_all("li"):
            link = item.a
            # Skip list items that are not city links instead of crashing
            # on a missing <a> tag or a missing href attribute.
            if link is None or not link.has_attr("href"):
                continue
            # Strip whitespace so the key matches what a user would type.
            cityinfo[item.get_text(strip=True)] = link.attrs["href"]
    # NOTE: no province -> cities grouping is built here. Only the flat
    # mapping is returned, and pairing <ul> elements with province headers
    # by position would raise IndexError whenever the page contains a <ul>
    # that has no matching "nav-province" <div>.
    return cityinfo
# Get city weather information.
# weatherhtml is a sample of the forecast markup: a <ul class="weather_datemate">
# whose <li> items each carry four <span> elements, in the order date,
# weather, wind, temperature. It is not referenced by any code visible in
# this file.
weatherhtml="""
<ul class="weather_datemate">
<li> <p class="weather_w_20"> <strong> 今天 </strong> <span>12.29</span> </p> <p class="weather_w_25"> <span class="same"> - / 多云 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> - - / 无持续风向 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> - / -6℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 明天 </strong> <span>12.30</span> </p> <p class="weather_w_25"> <span class="same"> 多云 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 无持续风向 3~4级 / 无持续风向 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> 0℃ / -9℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 后天 </strong> <span>12.31</span> </p> <p class="weather_w_25"> <span class="same"> 多云 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 无持续风向 微风 / 北风 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> -1℃ / -7℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 星期三 </strong> <span>01.01</span> </p> <p class="weather_w_25"> <span class="same"> 晴 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 北风 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> 3℃ / -5℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 星期四 </strong> <span>01.02</span> </p> <p class="weather_w_25"> <span class="same"> 晴 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 南风 微风 / 北风 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> 4℃ / -5℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 星期五 </strong> <span>01.03</span> </p> <p class="weather_w_25"> <span class="same"> 晴 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 南风 微风 / 北风 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> 6℃ / -3℃ </span> </p> </li>
<li> <p class="weather_w_20"> <strong> 星期六 </strong> <span>01.04</span> </p> <p class="weather_w_25"> <span class="same"> 多云 </span> </p> <p class="weather_w_35" style="width: 25%;"> <span class="same"> 南风 微风 / 北风 微风 </span> </p> <p class="weather_w_20" style="text-align: right; width: 30%;"> <span class="same"> 3℃ / -4℃ </span> </p> </li>
</ul>"""
# Get a city's forecast list; return a dict of per-day weather dicts.
def weatherinfo(cityurl, cityname):
    """Scrape the forecast list at *cityurl* into one record per day.

    Each ``<li>`` of the ``<ul class="weather_datemate">`` element becomes one
    record. Its ``<span>`` elements are read in document order as date,
    weather, wind and temperature; spans beyond the fourth are ignored, and a
    day with fewer spans yields a record with fewer fields.

    Args:
        cityurl: Full URL of the city's forecast page.
        cityname: City name, used as the prefix of every result key.

    Returns:
        dict: ``cityname + date`` -> ``{"date", "weather", "wind",
        "temperature"}``, with all values whitespace-stripped.

    Raises:
        ValueError: If the page contains no ``weather_datemate`` list.
    """
    html = BeautifulSoup(request_url(cityurl).content, "lxml")
    forecast_list = html.find("ul", class_="weather_datemate")
    if forecast_list is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError("no weather_datemate list found at " + str(cityurl))

    fields = ("date", "weather", "wind", "temperature")
    cityweather = {}
    for day in forecast_list.find_all("li"):
        spans = day.find_all("span")
        if not spans:
            # Without a date span there is nothing to key the record on.
            continue
        # A fresh dict per day is what lets several days coexist in the
        # result: reusing one dict would make every key alias the same
        # record.
        record = {
            field: span.get_text(strip=True)
            for field, span in zip(fields, spans)
        }
        cityweather[cityname + record["date"]] = record
    return cityweather
if __name__ == "__main__":
    # Load the city name -> relative link mapping.
    citynames = city_nameinfo()

    # Ask for a city until the answer is a known city name.
    while True:
        cityname = input("please input you chiose cityname:")
        if cityname in citynames:
            break
        print("您输入的不是城市名,请再次尝试输入!")

    # Build the absolute URL of the chosen city's forecast page.
    cityurl = "http://m.nmc.cn" + citynames[cityname]

    # Fetch the forecast records for that city.
    cityweather = weatherinfo(cityurl, cityname)

    # Ask for a date. An empty answer prints every record; a known date
    # prints that day's record; anything else asks again.
    while True:
        date = input("please input you chiose date:(format:mm.dd):")
        if date == '':
            print(cityweather)
            break
        lookup_key = cityname + date
        if lookup_key in cityweather:
            print(cityweather[lookup_key])
            break
        print("您输入日期不在查询期间或者格式不对,请再次尝试输入!")