Python爬虫抓取天气信息

最新推荐文章于 2025-03-12 19:53:16 发布

boluo_people12345

最新推荐文章于 2025-03-12 19:53:16 发布

阅读量801

点赞数 10

文章标签：爬虫 python

本文链接：https://blog.csdn.net/2302_81910218/article/details/143270419

版权

废话不多说，直接上代码。我们爬取的是中国天气网（www.weather.com.cn）的“衡阳天气预报、衡阳7天天气预报、衡阳15天天气预报、衡阳天气查询”页面，这个网站其他类型的页面也可以爬取。

import requests
from bs4 import BeautifulSoup

def _tag_text(tag):
    """Return the text of *tag*, or '' when the lookup found nothing."""
    return tag.get_text() if tag is not None else ''


def _parse_day_item(day_item):
    """Turn one forecast ``<li>`` element into a dict of strings.

    Any piece of markup that is missing yields an empty string instead of
    raising, so a single incomplete entry does not abort the whole scrape.
    """
    temp_p = day_item.find('p', class_='tem')
    wind_p = day_item.find('p', class_='win')

    # Temperatures: <span> holds the first value, <i> the second one.
    first_temp = _tag_text(temp_p.find('span')) if temp_p is not None else ''
    last_temp = _tag_text(temp_p.find('i')) if temp_p is not None else ''

    # Wind directions are stored in the `title` attribute of the <span>
    # children of <em>; collect whatever is present.
    wind_em = wind_p.find('em') if wind_p is not None else None
    wind_spans = wind_em.find_all('span') if wind_em is not None else []
    directions = [span.get('title', '') for span in wind_spans]
    first_wind_direct = directions[0] if directions else ''
    # With a single <span> only one direction is given for the period, so
    # the ending direction falls back to the starting one.
    last_wind_direct = directions[1] if len(directions) > 1 else first_wind_direct

    wind_speed = _tag_text(wind_p.find('i')) if wind_p is not None else ''

    return {
        'date': _tag_text(day_item.find('h1')),
        'weather': _tag_text(day_item.find('p', class_='wea')),
        'first_temp': first_temp,
        'last_temp': last_temp,
        'first_wind_direct': first_wind_direct,
        'last_wind_direct': last_wind_direct,
        'wind_speed': wind_speed,
    }


def get_html_weather_info(html_url):
    """Fetch a forecast page and extract one record per listed day.

    Args:
        html_url: URL of the page to scrape.

    Returns:
        A list of dicts, one per ``<li>`` carrying the ``skyid`` class, in
        the order the entries appear in the page. Each dict has the string
        keys ``date``, ``weather``, ``first_temp``, ``last_temp``,
        ``first_wind_direct``, ``last_wind_direct`` and ``wind_speed``;
        a value is '' when the corresponding markup is absent.

    Raises:
        SystemExit: with status 1 (after printing a message) when the
            server answers with a status code other than 200.
        requests.exceptions.RequestException: network-level failures,
            including the request timeout, propagate unchanged.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Charset': 'utf-8'
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url=html_url, headers=headers, timeout=10)
    if response.status_code != 200:  # request failed
        print(f'GET请求网页失败({response.status_code})：{html_url}')
        # Raise SystemExit directly: the `exit()` helper is injected by the
        # `site` module (not always available) and would report status 0.
        raise SystemExit(1)

    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    # Matching on the single `skyid` class keeps the entries in page order
    # and does not depend on the per-day level classes (`lv2`, `lv3`, `on`).
    day_items = soup.find_all('li', class_='skyid')
    return [_parse_day_item(day_item) for day_item in day_items]

if __name__ == '__main__':
    # Forecast page to scrape.
    # NOTE(review): the accompanying article describes this as the Hengyang
    # forecast, but station code 101160101 looks like a different city
    # (possibly Lanzhou) -- confirm the code before relying on the output.
    page_url = 'https://www.weather.com.cn/weather/101160101.shtml'
    # Scrape the page, then dump the per-day records to stdout.
    daily_records = get_html_weather_info(page_url)
    print(daily_records)

其他地区的同类型页面也可以爬取，只需把对应页面的网址传给 get_html_weather_info 这个函数即可。先用 pip 安装好 requests 和 beautifulsoup4 这两个第三方库，再把这段代码直接复制到 PyCharm 中就可以运行。