前言:Python 是进行数据分析和可视化的强大工具,常用的库包括 Pandas、NumPy、Matplotlib 和 Seaborn。以下是一个基本的教程概述,介绍了如何使用这些库来进行数据分析和可视化:
Python数据分析及可视化教程
1、 环境准备
- 确保已经安装了 Python 和相关库。你可以使用 pip 来安装所需的库:
pip install pandas numpy matplotlib seaborn
2、数据准备
通过一个商城数据分析和可视化的示例来展示如何使用 Python 进行数据分析和可视化。假设你有一个包含商城交易数据的 CSV 文件,文件名为 sales_data.csv,其中包含以下字段:
- order_id: 订单ID
- product: 产品名称
- quantity: 购买数量
- price: 单价
- total_amount: 总金额(通常是 quantity 和 price 的乘积)
- order_date: 订单日期
- customer_id: 顾客ID
3、开始数据分析
3.1、导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
3.2、加载数据
# Read the transaction records from the CSV file into a DataFrame.
data = pd.read_csv('sales_data.csv')

# Show the leading rows as a quick check that the file was parsed as expected.
preview = data.head()
print(preview)
3.3、数据预处理
确保数据的格式和类型正确。
# Parse the order date column into pandas datetime values.
data['order_date'] = pd.to_datetime(data['order_date'])

# Print a summary of the columns. `DataFrame.info()` writes its report itself
# and returns None, so it is called directly; wrapping it in print() would
# append a stray "None" line to the output.
data.info()

# Derive the total amount only when the file does not already provide it, so
# totals recorded in the data are not overwritten.
if 'total_amount' not in data.columns:
    data['total_amount'] = data['quantity'] * data['price']
3.4、数据分析
描述性统计
# Descriptive statistics of the data set.
summary = data.describe()
print(summary)

# Sum the sales amount of every product, then turn the grouped result back
# into a DataFrame with 'product' and 'total_amount' columns.
sales_by_product = data.groupby('product')['total_amount'].sum()
product_sales = sales_by_product.reset_index()
print(product_sales)
时间序列分析
按月计算总销售额趋势:
# Tag every order with the calendar month of its order date.
order_month = data['order_date'].dt.to_period('M')
data['month'] = order_month

# Sum the sales amount of each month.
monthly_totals = data.groupby('month')['total_amount'].sum()
monthly_sales = monthly_totals.reset_index()

# Convert the monthly periods to timestamps; this column is later used as the
# x values of the trend plot.
monthly_sales['month'] = monthly_sales['month'].dt.to_timestamp()
print(monthly_sales)
3.5、数据可视化
产品销售总额
# Bar chart of the total sales amount per product.
plt.figure(figsize=(10, 6))
sns.barplot(x='product', y='total_amount', data=product_sales, palette='viridis')
plt.xticks(rotation=45)
plt.xlabel('Product')
plt.ylabel('Total Sales Amount')
plt.title('Total Sales Amount by Product')
# Re-fit the layout so the rotated tick labels are not clipped at the figure
# edge (the complete script further below does the same).
plt.tight_layout()
plt.show()
销售额时间趋势:
# Line chart of the monthly sales totals, one marker per month.
plt.figure(figsize=(12, 6))
plt.plot(monthly_sales['month'], monthly_sales['total_amount'], marker='o')
plt.xlabel('Month')
plt.ylabel('Total Sales Amount')
plt.title('Monthly Sales Trend')
plt.grid(True)
plt.xticks(rotation=45)
# Re-fit the layout so the rotated date labels are not clipped at the figure
# edge (the complete script further below does the same).
plt.tight_layout()
plt.show()
顾客购买行为分析
假设我们想分析每个顾客的购买总额:
# Total purchase amount of every customer, as a two-column DataFrame.
spend_by_customer = data.groupby('customer_id')['total_amount'].sum()
customer_sales = spend_by_customer.reset_index()

# Histogram (30 bins) of the per-customer totals with a KDE curve on top.
plt.figure(figsize=(10, 6))
sns.histplot(customer_sales['total_amount'], bins=30, kde=True)
plt.title('Distribution of Total Sales Amount per Customer')
plt.xlabel('Total Sales Amount')
plt.ylabel('Number of Customers')
plt.show()
4、总结
通过上述分析,你可以获取以下信息:
- 哪些产品的销售额最高。
- 销售额在时间上的变化趋势。
- 顾客购买金额的分布情况。
请参考整体代码,如下:
# sales_data_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme to the plots created below. `set_theme` is
# the preferred name of this call; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")
def load_data(file_path):
    """Read the sales CSV and normalise its date and total columns.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with ``order_date`` converted by
        ``pd.to_datetime`` and with a ``total_amount`` column, computed as
        ``quantity * price`` when the file does not contain one.
    """
    frame = pd.read_csv(file_path)
    frame['order_date'] = pd.to_datetime(frame['order_date'])

    # Keep an existing total column untouched; only fill it in when absent.
    has_total = 'total_amount' in frame.columns
    if not has_total:
        frame['total_amount'] = frame['quantity'] * frame['price']
    return frame
def preprocess_data(data):
    """Drop incomplete rows from the data set.

    Args:
        data: DataFrame to clean.

    Returns:
        ``data`` itself when it holds no missing values; otherwise a new
        DataFrame without the rows that contain any missing value.
    """
    missing_per_column = data.isnull().sum()
    if not missing_per_column.any():
        return data
    # Dropping is the simplest strategy; filling the gaps is an alternative.
    return data.dropna()
def analyze_data(data):
    """Aggregate the sales amount by product, by month and by customer.

    The input DataFrame is left unmodified: the month of each order is kept
    in a separate Series that is used as the grouping key, instead of being
    written into ``data`` as an extra column.

    Args:
        data: DataFrame with ``product``, ``total_amount``, ``customer_id``
            columns and a datetime ``order_date`` column.

    Returns:
        A tuple ``(product_sales, monthly_sales, customer_sales)`` of
        DataFrames. Each has a ``total_amount`` column holding the summed
        amount, next to ``product``, ``month`` (timestamp of the month start)
        or ``customer_id`` respectively.
    """
    # Total sales amount per product.
    product_sales = data.groupby('product')['total_amount'].sum().reset_index()

    # Total sales amount per calendar month. The key Series is named 'month'
    # so that reset_index() produces a column with that name.
    order_month = data['order_date'].dt.to_period('M').rename('month')
    monthly_sales = data.groupby(order_month)['total_amount'].sum().reset_index()
    monthly_sales['month'] = monthly_sales['month'].dt.to_timestamp()

    # Total purchase amount per customer.
    customer_sales = data.groupby('customer_id')['total_amount'].sum().reset_index()
    return product_sales, monthly_sales, customer_sales
def visualize_data(product_sales, monthly_sales, customer_sales):
    """Display the three charts of the analysis, one figure after another.

    Args:
        product_sales: DataFrame with ``product`` and ``total_amount`` columns.
        monthly_sales: DataFrame with ``month`` and ``total_amount`` columns.
        customer_sales: DataFrame with a ``total_amount`` column.
    """
    figure_size = (12, 6)

    # Chart 1: bar chart of the total sales amount per product.
    # NOTE(review): recent seaborn releases reportedly warn when `palette` is
    # passed without `hue` - confirm against the installed version.
    plt.figure(figsize=figure_size)
    sns.barplot(data=product_sales, x='product', y='total_amount', palette='viridis')
    plt.title('Total Sales Amount by Product')
    plt.xlabel('Product')
    plt.ylabel('Total Sales Amount')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Chart 2: line chart of the monthly sales totals.
    months = monthly_sales['month']
    monthly_totals = monthly_sales['total_amount']
    plt.figure(figsize=figure_size)
    plt.plot(months, monthly_totals, marker='o')
    plt.title('Monthly Sales Trend')
    plt.xlabel('Month')
    plt.ylabel('Total Sales Amount')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Chart 3: histogram (30 bins, with KDE curve) of per-customer totals.
    customer_totals = customer_sales['total_amount']
    plt.figure(figsize=figure_size)
    sns.histplot(customer_totals, bins=30, kde=True)
    plt.title('Distribution of Total Sales Amount per Customer')
    plt.xlabel('Total Sales Amount')
    plt.ylabel('Number of Customers')
    plt.tight_layout()
    plt.show()
def main():
    """Run the pipeline on 'sales_data.csv': load, clean, analyse, plot."""
    sales = preprocess_data(load_data('sales_data.csv'))
    # analyze_data returns (product_sales, monthly_sales, customer_sales),
    # which is exactly the argument order visualize_data expects.
    summaries = analyze_data(sales)
    visualize_data(*summaries)


# Only run the pipeline when the file is executed as a script.
if __name__ == "__main__":
    main()
解释
- load_data 函数: 负责加载 CSV 文件并确保数据的日期列和总金额列格式正确。
- preprocess_data 函数: 处理缺失值,这里只是简单地删除缺失值。你可以根据实际情况选择合适的缺失值处理方法。
- analyze_data 函数: 进行数据分析,包括产品销售总额、月销售额趋势和顾客购买总额。
- visualize_data 函数: 绘制产品销售总额的条形图、月销售额的趋势图以及顾客购买总额的分布直方图。
- main 函数: 整合各个步骤,从加载数据到可视化。
使用方法:
将上述代码保存到名为 sales_data_analysis.py 的文件中。确保数据文件 sales_data.csv 位于相同目录下或提供正确的文件路径。然后在终端中运行该脚本:
python sales_data_analysis.py
5、错误处理和异常判断说明
为了使代码更加健壮,可以添加一些异常情况判断和错误处理机制。以下是包含错误处理和异常判断的完整代码示例:
# sales_data_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme to the plots created below. `set_theme` is
# the preferred name of this call; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")
def load_data(file_path):
    """Load the sales CSV, validate it and normalise its columns.

    Each failure is reported with a message that matches its cause and is
    then re-raised. Only the file read is covered by the catch-all
    "unexpected error" handler, so a missing column is no longer reported as
    an unexpected error.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with ``order_date`` converted by
        ``pd.to_datetime`` and with a ``total_amount`` column, computed as
        ``quantity * price`` when the file does not contain one.

    Raises:
        FileNotFoundError: The file does not exist.
        pd.errors.EmptyDataError: The file is empty.
        pd.errors.ParserError: The file could not be parsed.
        KeyError: The ``order_date`` column is missing.
        ValueError: The dates cannot be parsed, or ``total_amount`` is absent
            and cannot be computed because ``quantity`` or ``price`` is
            missing.
    """
    # Step 1: read the file; only I/O and parsing problems are handled here.
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        raise
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
        raise
    except pd.errors.ParserError:
        print("Error: The file could not be parsed.")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise

    # Step 2: make sure the date column holds datetime values.
    try:
        data['order_date'] = pd.to_datetime(data['order_date'])
    except KeyError:
        print("Error: Data must contain an 'order_date' column.")
        raise
    except (ValueError, TypeError) as e:
        print(f"Error: Could not parse 'order_date' values as dates: {e}")
        raise

    # Step 3: compute the total amount when the file does not provide it.
    if 'total_amount' not in data.columns:
        missing = [col for col in ('quantity', 'price') if col not in data.columns]
        if missing:
            message = "Data must contain 'quantity' and 'price' columns to calculate 'total_amount'."
            print(f"Error: {message}")
            raise ValueError(message)
        data['total_amount'] = data['quantity'] * data['price']
    return data
def preprocess_data(data):
    """Drop incomplete rows, printing a warning when any are found.

    Args:
        data: DataFrame to clean.

    Returns:
        ``data`` itself when it holds no missing values; otherwise a new
        DataFrame without the rows that contain any missing value.

    Raises:
        Exception: Any error raised while checking or dropping is printed and
            re-raised unchanged.
    """
    try:
        missing_per_column = data.isnull().sum()
        if not missing_per_column.any():
            return data
        print("Warning: Missing values detected. Dropping rows with missing values.")
        # Dropping is the simplest strategy; filling the gaps is an alternative.
        return data.dropna()
    except Exception as e:
        print(f"An error occurred during data preprocessing: {e}")
        raise
def analyze_data(data):
    """Aggregate the sales amount by product, by month and by customer.

    The input DataFrame is left unmodified: the month of each order is kept
    in a separate Series that is used as the grouping key, instead of being
    written into ``data`` as an extra column.

    Args:
        data: DataFrame with ``product``, ``total_amount``, ``customer_id``
            columns and a datetime ``order_date`` column.

    Returns:
        A tuple ``(product_sales, monthly_sales, customer_sales)`` of
        DataFrames. Each has a ``total_amount`` column holding the summed
        amount, next to ``product``, ``month`` (timestamp of the month start)
        or ``customer_id`` respectively.

    Raises:
        KeyError: An expected column is missing (printed, then re-raised).
        Exception: Any other error is printed and re-raised unchanged.
    """
    try:
        # Total sales amount per product.
        product_sales = data.groupby('product')['total_amount'].sum().reset_index()

        # Total sales amount per calendar month. The key Series is named
        # 'month' so that reset_index() produces a column with that name.
        order_month = data['order_date'].dt.to_period('M').rename('month')
        monthly_sales = data.groupby(order_month)['total_amount'].sum().reset_index()
        monthly_sales['month'] = monthly_sales['month'].dt.to_timestamp()

        # Total purchase amount per customer.
        customer_sales = data.groupby('customer_id')['total_amount'].sum().reset_index()
        return product_sales, monthly_sales, customer_sales
    except KeyError as e:
        print(f"Error: Missing expected column in data: {e}")
        raise
    except Exception as e:
        print(f"An error occurred during data analysis: {e}")
        raise
def visualize_data(product_sales, monthly_sales, customer_sales):
    """Display the three charts of the analysis, one figure after another.

    Args:
        product_sales: DataFrame with ``product`` and ``total_amount`` columns.
        monthly_sales: DataFrame with ``month`` and ``total_amount`` columns.
        customer_sales: DataFrame with a ``total_amount`` column.

    Raises:
        Exception: Any plotting error is printed and re-raised unchanged.
    """
    figure_size = (12, 6)
    try:
        # Chart 1: bar chart of the total sales amount per product.
        # NOTE(review): recent seaborn releases reportedly warn when `palette`
        # is passed without `hue` - confirm against the installed version.
        plt.figure(figsize=figure_size)
        sns.barplot(data=product_sales, x='product', y='total_amount', palette='viridis')
        plt.title('Total Sales Amount by Product')
        plt.xlabel('Product')
        plt.ylabel('Total Sales Amount')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Chart 2: line chart of the monthly sales totals.
        months = monthly_sales['month']
        monthly_totals = monthly_sales['total_amount']
        plt.figure(figsize=figure_size)
        plt.plot(months, monthly_totals, marker='o')
        plt.title('Monthly Sales Trend')
        plt.xlabel('Month')
        plt.ylabel('Total Sales Amount')
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # Chart 3: histogram (30 bins, with KDE curve) of per-customer totals.
        customer_totals = customer_sales['total_amount']
        plt.figure(figsize=figure_size)
        sns.histplot(customer_totals, bins=30, kde=True)
        plt.title('Distribution of Total Sales Amount per Customer')
        plt.xlabel('Total Sales Amount')
        plt.ylabel('Number of Customers')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"An error occurred during data visualization: {e}")
        raise
def main():
    """Run the pipeline on 'sales_data.csv': load, clean, analyse, plot.

    Any error raised by a step is reported here instead of propagating as a
    traceback.

    Returns:
        0 when every step succeeded, 1 when a step failed. The value is used
        as the process exit status so that callers of the script (shells,
        schedulers) can detect a failed run.
    """
    file_path = 'sales_data.csv'
    try:
        data = load_data(file_path)
        data = preprocess_data(data)
        product_sales, monthly_sales, customer_sales = analyze_data(data)
        visualize_data(product_sales, monthly_sales, customer_sales)
    except Exception as e:
        print(f"An error occurred during the process: {e}")
        return 1
    return 0


# Only run the pipeline when the file is executed as a script, and hand the
# result of main() to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
说明
1、load_data 函数:
- 捕获 FileNotFoundError 错误,当文件不存在时给出提示。
- 捕获 pd.errors.EmptyDataError 错误,当文件为空时给出提示。
- 捕获 pd.errors.ParserError 错误,当文件无法解析时给出提示。
- 捕获其他未预见的错误并给出提示。
2、preprocess_data 函数:
- 捕获数据预处理过程中可能出现的任何错误,并给出提示。
3、analyze_data 函数:
- 捕获 KeyError 错误,提醒缺少预期的列。
- 捕获其他分析过程中的错误并给出提示。
4、visualize_data 函数:
- 捕获可视化过程中的任何错误,并给出提示。
5、main 函数:
- 捕获整个流程中的任何未预见的错误,并给出提示。
6、打印日志
为了在脚本中添加打印日志,可以使用 Python 的 logging 模块来记录信息。logging 模块允许你记录不同级别的日志信息(如 DEBUG、INFO、WARNING、ERROR 和 CRITICAL),并将日志写入文件或打印到控制台。
# sales_data_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Apply seaborn's "whitegrid" theme to the plots created below. `set_theme` is
# the preferred name of this call; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")
# Write every record from DEBUG upwards to 'sales_data_analysis.log', each
# line carrying a timestamp and the level name.
logging.basicConfig(
    filename='sales_data_analysis.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# With the root logger at DEBUG, matplotlib's own internal debug messages
# would end up in the log file as well; keep only its warnings and errors so
# the script's messages stay readable.
logging.getLogger('matplotlib').setLevel(logging.WARNING)
def load_data(file_path):
    """Load the sales CSV, validate it and normalise its columns.

    Progress and failures are written to the log. Each failure is logged with
    a message that matches its cause and is then re-raised. Only the file
    read is covered by the catch-all "unexpected error" handler, so a missing
    column is no longer logged as an unexpected error.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with ``order_date`` converted by
        ``pd.to_datetime`` and with a ``total_amount`` column, computed as
        ``quantity * price`` when the file does not contain one.

    Raises:
        FileNotFoundError: The file does not exist.
        pd.errors.EmptyDataError: The file is empty.
        pd.errors.ParserError: The file could not be parsed.
        KeyError: The ``order_date`` column is missing.
        ValueError: The dates cannot be parsed, or ``total_amount`` is absent
            and cannot be computed because ``quantity`` or ``price`` is
            missing.
    """
    # Step 1: read the file; only I/O and parsing problems are handled here.
    try:
        logging.info(f"Loading data from {file_path}.")
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"File '{file_path}' not found.")
        raise
    except pd.errors.EmptyDataError:
        logging.error("The file is empty.")
        raise
    except pd.errors.ParserError:
        logging.error("The file could not be parsed.")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading data: {e}")
        raise

    # Step 2: make sure the date column holds datetime values.
    try:
        data['order_date'] = pd.to_datetime(data['order_date'])
    except KeyError:
        logging.error("Data must contain an 'order_date' column.")
        raise
    except (ValueError, TypeError) as e:
        logging.error(f"Could not parse 'order_date' values as dates: {e}")
        raise

    # Step 3: compute the total amount when the file does not provide it.
    if 'total_amount' not in data.columns:
        missing = [col for col in ('quantity', 'price') if col not in data.columns]
        if missing:
            message = "Data must contain 'quantity' and 'price' columns to calculate 'total_amount'."
            logging.error(message)
            raise ValueError(message)
        data['total_amount'] = data['quantity'] * data['price']

    logging.info("Data loaded successfully.")
    return data
def preprocess_data(data):
    """Drop incomplete rows, logging a warning when any are found.

    Args:
        data: DataFrame to clean.

    Returns:
        ``data`` itself when it holds no missing values; otherwise a new
        DataFrame without the rows that contain any missing value.

    Raises:
        Exception: Any error raised while checking or dropping is logged and
            re-raised unchanged.
    """
    try:
        missing_per_column = data.isnull().sum()
        if not missing_per_column.any():
            return data
        logging.warning("Missing values detected. Dropping rows with missing values.")
        # Dropping is the simplest strategy; filling the gaps is an alternative.
        return data.dropna()
    except Exception as e:
        logging.error(f"An error occurred during data preprocessing: {e}")
        raise
def analyze_data(data):
    """Aggregate the sales amount by product, by month and by customer.

    The input DataFrame is left unmodified: the month of each order is kept
    in a separate Series that is used as the grouping key, instead of being
    written into ``data`` as an extra column.

    Args:
        data: DataFrame with ``product``, ``total_amount``, ``customer_id``
            columns and a datetime ``order_date`` column.

    Returns:
        A tuple ``(product_sales, monthly_sales, customer_sales)`` of
        DataFrames. Each has a ``total_amount`` column holding the summed
        amount, next to ``product``, ``month`` (timestamp of the month start)
        or ``customer_id`` respectively.

    Raises:
        KeyError: An expected column is missing (logged, then re-raised).
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Total sales amount per product.
        product_sales = data.groupby('product')['total_amount'].sum().reset_index()

        # Total sales amount per calendar month. The key Series is named
        # 'month' so that reset_index() produces a column with that name.
        order_month = data['order_date'].dt.to_period('M').rename('month')
        monthly_sales = data.groupby(order_month)['total_amount'].sum().reset_index()
        monthly_sales['month'] = monthly_sales['month'].dt.to_timestamp()

        # Total purchase amount per customer.
        customer_sales = data.groupby('customer_id')['total_amount'].sum().reset_index()

        logging.info("Data analysis completed successfully.")
        return product_sales, monthly_sales, customer_sales
    except KeyError as e:
        logging.error(f"Missing expected column in data: {e}")
        raise
    except Exception as e:
        logging.error(f"An error occurred during data analysis: {e}")
        raise
def visualize_data(product_sales, monthly_sales, customer_sales):
    """Display the three charts of the analysis and log each one.

    Args:
        product_sales: DataFrame with ``product`` and ``total_amount`` columns.
        monthly_sales: DataFrame with ``month`` and ``total_amount`` columns.
        customer_sales: DataFrame with a ``total_amount`` column.

    Raises:
        Exception: Any plotting error is logged and re-raised unchanged.
    """
    figure_size = (12, 6)
    try:
        # Chart 1: bar chart of the total sales amount per product.
        # NOTE(review): recent seaborn releases reportedly warn when `palette`
        # is passed without `hue` - confirm against the installed version.
        plt.figure(figsize=figure_size)
        sns.barplot(data=product_sales, x='product', y='total_amount', palette='viridis')
        plt.title('Total Sales Amount by Product')
        plt.xlabel('Product')
        plt.ylabel('Total Sales Amount')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        logging.info("Product sales bar plot created.")

        # Chart 2: line chart of the monthly sales totals.
        months = monthly_sales['month']
        monthly_totals = monthly_sales['total_amount']
        plt.figure(figsize=figure_size)
        plt.plot(months, monthly_totals, marker='o')
        plt.title('Monthly Sales Trend')
        plt.xlabel('Month')
        plt.ylabel('Total Sales Amount')
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.tight_layout()
        plt.show()
        logging.info("Monthly sales trend line plot created.")

        # Chart 3: histogram (30 bins, with KDE curve) of per-customer totals.
        customer_totals = customer_sales['total_amount']
        plt.figure(figsize=figure_size)
        sns.histplot(customer_totals, bins=30, kde=True)
        plt.title('Distribution of Total Sales Amount per Customer')
        plt.xlabel('Total Sales Amount')
        plt.ylabel('Number of Customers')
        plt.tight_layout()
        plt.show()
        logging.info("Customer total sales amount histogram created.")
    except Exception as e:
        logging.error(f"An error occurred during data visualization: {e}")
        raise
def main():
    """Run the pipeline on 'sales_data.csv': load, clean, analyse, plot.

    Any error raised by a step is logged here, together with its traceback,
    instead of propagating.

    Returns:
        0 when every step succeeded, 1 when a step failed. The value is used
        as the process exit status so that callers of the script (shells,
        schedulers) can detect a failed run.
    """
    file_path = 'sales_data.csv'
    try:
        data = load_data(file_path)
        data = preprocess_data(data)
        product_sales, monthly_sales, customer_sales = analyze_data(data)
        visualize_data(product_sales, monthly_sales, customer_sales)
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # which makes the failing step identifiable from the log file alone.
        logging.exception(f"An error occurred during the process: {e}")
        return 1
    return 0


# Only run the pipeline when the file is executed as a script, and hand the
# result of main() to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())