使用 pip install 安装依赖库:requests、beautifulsoup4、lxml
打开开发人员工具:
点击网络后刷新>>点击文档>>点击左侧的hb.shtml>>点击预览,可以看到需要采集的数据都在此页面中>>点击标头,可以看到此页面对应的请求url,请求方式,Cookie,User-Agent
点击页面左侧“小鼠标”进行定位,缩小查找范围,逐步找到每条要爬取的数据的位置
# Scrape the China Weather Network "North China" (华北) text-forecast page and
# save each city's daytime forecast to weather.csv.
# Data source: http://www.weather.com.cn/textFC/hb.shtml
import csv
import requests
from bs4 import BeautifulSoup

# Request URL for the North-China text forecast page.
url_weather = 'http://www.weather.com.cn/textFC/hb.shtml'
# Request headers: Cookie and User-Agent copied from the browser dev tools so the
# site serves the same page a real browser would receive.
headers_weather = {
'Cookie':'userNewsPort0=1; f_city=%E6%88%90%E9%83%BD%7C101270101%7C; Hm_lvt_080dabacb001ad3dc8b9b9049b36d43b=1718765458,1718768689; Hm_lpvt_080dabacb001ad3dc8b9b9049b36d43b=1718776730',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
}
# Send the GET request; bound the wait with a timeout and fail fast on 4xx/5xx
# instead of silently parsing an error page.
response = requests.get(url=url_weather, headers=headers_weather, timeout=10)
response.raise_for_status()
# Print response status and the server-declared encoding.
print('响应状态码:', response.status_code)
print('响应编码:', response.encoding)
# The page content is UTF-8 but the server does not always declare it, so set it
# explicitly before reading response.text.
response.encoding = 'UTF-8'
# Uncomment to inspect the raw HTML:
# print('响应数据:',response.text)

# Parse the HTML into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(markup=response.text, features='lxml')
print(soup.html.head.title)

# Each province's forecast is a separate <table>; only the first five tables on
# this page hold the data we want, so stop the search there.
tables = soup.find_all('table', limit=5)

with open(file='weather.csv', mode='w', encoding='UTF-8', newline='') as file:
    writer = csv.writer(file)
    # Header row so the generated CSV is self-describing.
    writer.writerow(['province', 'city', 'weather_day', 'wind_day',
                     'wind_day_level', 'temperature_day', 'detail_url'])
    for table_city in tables:
        # Skip the two header rows; every remaining <tr> is one city.
        trs = table_city.find_all('tr')[2:]
        # The province name lives in the first data row's rowsPan cell
        # (it spans all the province's rows via rowspan).
        province = trs[0].find('td', class_="rowsPan").a.string
        for tr in trs:
            # Drop the rowspan province cell so the column indexes below are
            # uniform for every row (first and subsequent rows alike).
            tds = tr.select('td:not(.rowsPan)')
            city = tds[0].a.string
            weather_day = tds[1].string
            wind_day = tds[2].span.string
            wind_day_level = tds[2].find('span', class_='conMidtabright').string
            temperature_day = tds[3].string
            detail_url = tds[7].a.attrs['href']
            print(
                '获取到城市数据:province={}, city={}, weather_day={}, wind_day={}, wind_day_level={}, temperature_day={}, detail_url={}'
                .format(province, city, weather_day, wind_day, wind_day_level, temperature_day, detail_url)
            )
            # Append this city's record to the CSV file.
            writer.writerow([province, city, weather_day, wind_day, wind_day_level, temperature_day, detail_url])
在 PyCharm 中运行后,会生成一个 weather.csv 文件: