以下是911天气爬取的程序
爬取内容包括:时间、图片、天气、温度、湿度、风力、风级、降水量、体感温度、云量。不过最近 911 已经没有数据了,于是又换了一个天气网站进行爬取,对应文章:https://blog.csdn.net/weixin_43128028/article/details/108391587
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.relativedelta import relativedelta
from datetime import datetime
class weather_data:
    """Scrape month-by-month weather tables for one city from tianqi.911cha.com.

    One page is requested per calendar month, from the start month through
    the end month (both inclusive). Each page is parsed into a mapping of
    ``date text -> {"Day": {...}, "Night": {...}}``.
    """

    # Column names, in the order they are zipped against a row's <td> cells:
    # time, image, weather, temperature, humidity, wind force, wind scale,
    # precipitation, apparent temperature, cloud amount.
    # The spellings are runtime dictionary keys and are kept exactly as-is.
    _FIELDS = (
        'time', 'image', 'weather', 'temperature', 'humidity',
        'wind_force', 'wind_scale', 'precipitation',
        'sendible_temperature', 'cloud_amount',
    )

    # Seconds to wait for the server; without a timeout requests may block
    # forever on an unresponsive host.
    _REQUEST_TIMEOUT = 30

    def __init__(self, city, start_year, end_year, start_month=1, end_month=12):
        """
        :param city: full pinyin spelling of the city to scrape
        :param start_year: first year to scrape
        :param end_year: last year to scrape
        :param start_month: first month to scrape (within start_year)
        :param end_month: last month to scrape (within end_year)
        """
        self.city = city
        self.start_time = datetime.strptime(f"{start_year}-{start_month}", '%Y-%m')
        self.end_time = datetime.strptime(f"{end_year}-{end_month}", '%Y-%m')

    def _get_original_html(self):
        """Download the page for the month held in ``self.start_time``.

        Prints the requested URL and returns the response body decoded as
        UTF-8.
        """
        url = f"https://tianqi.911cha.com/{self.city}/{self.start_time.year}-{self.start_time.month}.html"
        print(url)
        # Placeholder value: replace it with your own browser's User-Agent.
        header = {
            "User-Agent": "……"}
        response = requests.get(url, headers=header, timeout=self._REQUEST_TIMEOUT)
        return response.content.decode("utf-8")

    def _parse_data(self):
        """Parse one month of rows out of ``self.html``.

        The first ``<tr>`` is skipped. After that, rows alternate: an
        odd-indexed row supplies the date (text of its first ``<a>``) and the
        "Day" record, and the following even-indexed row supplies the "Night"
        record for that same date.

        :return: ``defaultdict`` mapping date text to its "Day"/"Night" dicts
        """
        soup = BeautifulSoup(self.html, "html.parser")
        data = defaultdict(dict)
        year_prefix = str(self.start_time.year) + '-'
        for n, tr in enumerate(soup.find_all("tr")):
            if n == 0:
                continue
            cells = tr.find_all("td")
            if n % 2 != 0:
                date = tr.find("a").get_text()
                # NOTE(review): the year is prefixed onto the *field names* of
                # the Day record only (e.g. "2017-time"), while Night keys are
                # left bare. This looks unintended, but it is preserved so the
                # returned structure stays unchanged.
                data[date]["Day"] = {
                    year_prefix + key: con.get_text()
                    for key, con in zip(self._FIELDS, cells)
                }
            else:
                data[date]["Night"] = {
                    key: con.get_text()
                    for key, con in zip(self._FIELDS, cells)
                }
        return data

    def main(self):
        """Scrape every month in the configured range.

        :return: list with one parsed mapping per month, in chronological order
        """
        first_month = self.start_time
        data = []
        try:
            # The helpers read the current month from self.start_time, so it
            # is used as the loop cursor here.
            while self.start_time <= self.end_time:
                self.html = self._get_original_html()
                data.append(self._parse_data())
                self.start_time += relativedelta(months=1)
        finally:
            # Put the configured start month back so the instance still
            # describes the requested range and main() can be called again.
            self.start_time = first_month
        return data
if __name__ == "__main__":
    # Example run: scrape Jinan from 2017-01 through 2020-02 and dump the
    # parsed result of each month to a text file.
    T = weather_data(city="jinan", start_year=2017, end_year=2020, start_month=1, end_month=2)
    with open('weather_dict.txt', 'w', encoding='UTF-8') as f:
        for line in T.main():
            # One month per line. The previous writelines(str(line)) walked
            # the string character by character and wrote no separator, so
            # all months ran together on a single line.
            f.write(f"{line}\n")