基于python的天气网站数据爬取和可视化分析项目

浅唱王

已于 2024-05-08 21:30:01 修改

阅读量943

点赞数 3

文章标签： python 信息可视化数据分析

于 2024-04-29 12:17:20 首次发布

本文链接：https://blog.csdn.net/sjsnbwjsj/article/details/138308480

版权

基于python的天气网站数据爬取和可视化分析项目 2024.03-2024.04
项目职责：策划者和执行者
项目地址:
项目描述：本项目旨在通过爬虫技术爬取中国气象网的天气数据，并使用pandas、pyecharts对天气数据进行可视化分析。

项目环境：pycharm，python3.9 requests bs pandas jupyter-notebook pyecharts,腾讯云，mariadb等
项目步骤：

1.数据爬取：使用requests库模拟浏览器访问中国气象网并爬取数据，使用BeautifulSoup和lxml进行数据提取，包括日期、气温、风向、风级、风速、气压、降水、PM2.5等。

将常用的函数封装成模块--》导入

"""
@author: wangyalin
@file: download.py
@time: 2023/11/30 10:42
    把可重复使用的功能进行封装
    download模块（文件名：符合标识符的命名规则）
"""
import requests
import time
import logging

# 获取文本
def download(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Sends a GET request with a desktop-browser ``User-Agent`` header, decodes
    the body with the encoding detected from its content, and sleeps for one
    second after a successful request before returning.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The decoded response text, or ``""`` when the request fails or the
        server answers with an error status.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    }
    try:
        # Only the network call and the status check are expected to fail.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Catch request failures only: a bare ``except`` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        logging.error(url+"访问错误")
        return ""

    # Decode using the encoding guessed from the body rather than the
    # (possibly missing or wrong) one declared in the HTTP headers.
    response.encoding = response.apparent_encoding
    # Pause between requests so consecutive calls do not hammer the site.
    time.sleep(1)
    logging.info(url+"访问成功")
    return response.text


# 获取二进制
def download_img(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Sends a GET request with a desktop-browser ``User-Agent`` header and
    sleeps for one second after a successful request before returning.

    Args:
        url: Address of the binary resource (e.g. an image) to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The undecoded response body (``bytes``) on success. On failure the
        empty *str* ``""`` is returned; it is kept as a str rather than
        ``b""`` so existing callers that compare against ``""`` keep working.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    }
    try:
        # Only the network call and the status check are expected to fail.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Catch request failures only: a bare ``except`` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        logging.error(url + "访问错误")
        return ""

    # No text-encoding detection here: ``response.content`` is the raw bytes
    # and is not affected by ``response.encoding``.
    # Pause between requests so consecutive calls do not hammer the site.
    time.sleep(1)
    logging.info(url+"访问成功")
    return response.content

2.数据存储：将爬取的数据存储到腾讯云(cos),其他数据存储到本地mariadb数据库上，我成功爬取了中国气象网北京等几个城市的数据，为后续数据分析提供数据支撑。

上传本地mysql服务器做保存数据

"""
@author: wangyalin
@file: pyecharts_test.py
@time: 2024/5/8 18:11
"""
import pandas as pd
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts

# 读取 CSV 文件
# Load the scraped weather records from the CSV file.
df = pd.read_csv("weacher7.csv")

# Discard columns, then rows, in which every value is missing.
df = df.dropna(axis="columns", how="all").dropna(axis="index", how="all")

# 添加温度类型列
def get_wendu_type(x):
    """Return the temperature category of one weather record.

    ``x`` is looked up by column name. A maximum temperature above 28 wins
    and yields "高温"; otherwise a minimum temperature below 10 yields "低温";
    every other record is "常温". The minimum is only read when the maximum
    does not already decide the category.
    """
    if x["最高温度"] > 28:
        return "高温"
    return "低温" if x["最低温度"] < 10 else "常温"
# Label every record with its temperature category.
df["温度类型"] = df.apply(get_wendu_type, axis=1)

# Index by city so one city's records can be selected by label.
df.set_index('城市', inplace=True)

# Select Beijing with a list label: unlike df.loc['北京'], this yields a
# DataFrame even when only a single Beijing row exists, so the groupby
# below always works.
df_beijing = df.loc[['北京']]

# Number of Beijing records in each temperature category (computed once and
# shared by the pie chart and the bar chart).
y_data_pie = df_beijing.groupby("温度类型")["日期"].count()

# Pie chart: share of each temperature category in Beijing.
datas_pie = [(name, int(total)) for name, total in y_data_pie.items()]
pie_chart = (
    Pie()
    .add("温度类型占比", datas_pie, percent_precision=1)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京温度类型占比"))
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter="{b}:{c} {d}%"),
        tooltip_opts=opts.TooltipOpts(formatter="{b}:{d}%")
    )
)

# Bar chart: the same counts, ordered from smallest to largest.
y_data_bar = y_data_pie.sort_values()
data_bar = [int(total) for total in y_data_bar]
bar_chart = (
    Bar()
    .add_xaxis(list(y_data_bar.index))
    .add_yaxis("北京", data_bar)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京天气", subtitle="温度类型统计"))
)

# Configure matplotlib before any figure is created so the settings are in
# effect for all text drawn on the line chart.
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # font able to render the Chinese labels
plt.rcParams["axes.unicode_minus"] = False  # avoid broken minus signs with this font

# Line chart: per date, the highest maximum and the lowest minimum
# temperature across every city in the file (df, not only Beijing).
data_line = df.groupby("日期").agg({"最高温度": "max", "最低温度": "min"})
line_chart = data_line.plot()

# Write all three charts to disk (the line chart is saved exactly once).
pie_chart.render("北京温度类型占比.html")
bar_chart.render("北京天气温度类型统计.html")
line_chart.figure.savefig("温度变化曲线图.png")

同时备份数据到腾讯云服务器--》做备份和冗余


from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
from qcloud_cos.cos_exception import CosClientError, CosServiceError
import sys
import os
import logging

# 正常情况日志级别使用 INFO，需要定位时可以修改为 DEBUG，此时 SDK 会打印和服务端的通信信息
# INFO is the normal log level; switch to DEBUG when troubleshooting, in which
# case the SDK also prints its communication with the server.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# 1. Account settings: secret_id, secret_key, region, etc. Appid has been
#    removed from CosConfig; include it in the Bucket argument instead
#    (Bucket is made up of BucketName-Appid).
#
# Credentials are read from the environment only and must never be written
# into this file, not even in a commented-out line. Using a sub-account key
# granted the minimum required permissions is recommended; see
# https://cloud.tencent.com/document/product/598/37140
secret_id = os.environ['COS_SECRET_ID']  # raises KeyError if the variable is unset
secret_key = os.environ['COS_SECRET_KEY']  # raises KeyError if the variable is unset

# Region the bucket belongs to; it can be looked up in the console at
# https://console.cloud.tencent.com/cos5/bucket
# All regions supported by COS: https://cloud.tencent.com/document/product/436/6224
region = 'ap-guangzhou'

# Leave as None for permanent keys; temporary keys need a token, see
# https://cloud.tencent.com/document/product/436/14048
token = None

# Protocol used to reach COS (http or https); https is the default.
scheme = 'https'

config = CosConfig(Region=region, SecretId=secret_id, SecretKey=secret_key, Token=token, Scheme=scheme)
client = CosS3Client(config)
import os

def upload_folder(remote_dir, local_dir, bucket="wang-1325844428"):
    """Upload every file found under *local_dir* to COS.

    Walks *local_dir* recursively and hands each file to ``upload``. The
    object key is *remote_dir* joined with the file's path relative to
    *local_dir*, with backslashes turned into "/" so keys look the same
    regardless of the local path separator.

    Args:
        remote_dir: Key prefix under which the files are stored.
        local_dir: Local directory whose contents are uploaded.
        bucket: Name of the destination bucket.
    """
    for current_dir, _subdirs, filenames in os.walk(local_dir):
        for filename in filenames:
            source = os.path.join(current_dir, filename)
            # The path relative to the walked root becomes the key suffix.
            suffix = os.path.relpath(source, local_dir)
            object_key = os.path.join(remote_dir, suffix).replace("\\", "/")
            upload(object_key, source, bucket)

def upload(key, local_path, bucket="wang-1325844428"):
    """Upload one local file to COS and return its URL.

    Uses the SDK's high-level ``upload_file`` interface, which supports
    resumable upload: a retry does not re-send parts that already succeeded.
    The upload is attempted up to 10 times.

    Args:
        key: Object key (path inside the bucket) to store the file under.
        local_path: Path of the local file to upload.
        bucket: Name of the destination bucket.

    Returns:
        The object's URL once an attempt succeeds, or ``""`` if every
        attempt failed, so callers never receive a URL for an object that
        was not actually uploaded.
    """
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        try:
            client.upload_file(
                Bucket=bucket,
                Key=key,
                LocalFilePath=local_path)
        except (CosClientError, CosServiceError) as e:
            logging.warning("upload of %s failed (attempt %d/%d): %s",
                            local_path, attempt, max_attempts, e)
        else:
            return f"{scheme}://{bucket}.cos.{region}.myqcloud.com/{key}"

    # Every attempt raised: report it instead of pretending success.
    logging.error("giving up on %s after %d attempts", local_path, max_attempts)
    return ""

# 使用示例
upload_folder("爬虫项目项目汇总", "./爬虫项目项目汇总")

3.数据清洗和整理：利用pandas对爬取的数据进行清洗和整理，去除重复和异常值，保证数据的可靠性。

4.数据可视化：利用pyecharts库将清洗后的数据以图表的形式展示，例如折线图展示一年气温变化曲线，饼图展示

使用pandas做数据清洗

使用pyecharts进行数据可视化

matplotlib--》绘制图片

"""
@author: wangyalin
@file: pyecharts_test.py
@time: 2024/5/8 18:11
"""
import pandas as pd
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts

# 读取 CSV 文件
# Load the scraped weather records from the CSV file.
df = pd.read_csv("weacher7.csv")

# Discard columns, then rows, in which every value is missing.
df = df.dropna(axis="columns", how="all").dropna(axis="index", how="all")

# 添加温度类型列
def get_wendu_type(x):
    """Return the temperature category of one weather record.

    ``x`` is looked up by column name. A maximum temperature above 28 wins
    and yields "高温"; otherwise a minimum temperature below 10 yields "低温";
    every other record is "常温". The minimum is only read when the maximum
    does not already decide the category.
    """
    if x["最高温度"] > 28:
        return "高温"
    return "低温" if x["最低温度"] < 10 else "常温"
# Label every record with its temperature category.
df["温度类型"] = df.apply(get_wendu_type, axis=1)

# Index by city so one city's records can be selected by label.
df.set_index('城市', inplace=True)

# Select Beijing with a list label: unlike df.loc['北京'], this yields a
# DataFrame even when only a single Beijing row exists, so the groupby
# below always works.
df_beijing = df.loc[['北京']]

# Number of Beijing records in each temperature category (computed once and
# shared by the pie chart and the bar chart).
y_data_pie = df_beijing.groupby("温度类型")["日期"].count()

# Pie chart: share of each temperature category in Beijing.
datas_pie = [(name, int(total)) for name, total in y_data_pie.items()]
pie_chart = (
    Pie()
    .add("温度类型占比", datas_pie, percent_precision=1)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京温度类型占比"))
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter="{b}:{c} {d}%"),
        tooltip_opts=opts.TooltipOpts(formatter="{b}:{d}%")
    )
)

# Bar chart: the same counts, ordered from smallest to largest.
y_data_bar = y_data_pie.sort_values()
data_bar = [int(total) for total in y_data_bar]
bar_chart = (
    Bar()
    .add_xaxis(list(y_data_bar.index))
    .add_yaxis("北京", data_bar)
    .set_global_opts(title_opts=opts.TitleOpts(title="北京天气", subtitle="温度类型统计"))
)

# Configure matplotlib before any figure is created so the settings are in
# effect for all text drawn on the line chart.
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]  # font able to render the Chinese labels
plt.rcParams["axes.unicode_minus"] = False  # avoid broken minus signs with this font

# Line chart: per date, the highest maximum and the lowest minimum
# temperature across every city in the file (df, not only Beijing).
data_line = df.groupby("日期").agg({"最高温度": "max", "最低温度": "min"})
line_chart = data_line.plot()

# Write all three charts to disk (the line chart is saved exactly once).
pie_chart.render("北京温度类型占比.html")
bar_chart.render("北京天气温度类型统计.html")
line_chart.figure.savefig("温度变化曲线图.png")

一年中的气温分布，柱形图展示按空气质量排序的结果，地图展示各地区的天气质量，以便更好地观测数据。

实现的图片：

http://localhost:63342/python/%E7%88%AC%E8%99%AB%E9%A1%B9%E7%9B%AE/%E7%88%AC%E8%99%AB%E4%B8%8A%E8%AF%BE%E4%BB%A3%E7%A0%81-20240115/%E5%8C%97%E4%BA%AC%E5%A4%A9%E6%B0%94%E6%B8%A9%E5%BA%A6%E7%B1%BB%E5%9E%8B%E7%BB%9F%E8%AE%A1.html?_ijt=muv8o99t5bhm044riji2snb8ft