智慧海洋竞赛实践 专题二
文章目录
1. 背景介绍
本专题主要介绍探索性数据分析(EDA),通过数据分析,可以熟悉数据集的基本情况(缺失值、异常值),了解特征之间的相关性、分布,以及特征与预测值之间的关系,为进行特征工程提供理论依据。
2.数据概述
2.1 读入数据
2.1.1 准备工作
import read_all_data
将read_all_data.py文件存入jupyter notebook工作路径中,并修改.py文件中数据存放的路径
# -*- coding: utf-8 -*-
# @Time : 2021/3/7 22:23
# @Author : Evan_wyl
# @File : read_all_data.py
import pandas as pd
def read_train_file(filename=None, data_dir=None):
    """Load one training-set CSV file into a DataFrame.

    Args:
        filename: Name of the CSV file inside the training data directory.
        data_dir: Directory that holds the training files. When omitted, the
            hard-coded default below is used, so existing callers keep
            working; pass it explicitly instead of editing this file.

    Returns:
        The DataFrame returned by ``pd.read_csv`` (file decoded as UTF-8).

    Raises:
        TypeError: If ``filename`` is not supplied.
    """
    import os

    if filename is None:
        # The original failed here with an opaque "str + NoneType" TypeError.
        raise TypeError("read_train_file() requires a filename")
    if data_dir is None:
        # Default location; replace with the path where the data is stored.
        data_dir = "C://Users//Administrator//Desktop//windomsOcean//data//hy_round1_train_20200102//"
    return pd.read_csv(os.path.join(data_dir, filename), encoding="utf-8")
def read_test_file(filename=None, data_dir=None):
    """Load one test-set CSV file into a DataFrame.

    Args:
        filename: Name of the CSV file inside the test data directory.
        data_dir: Directory that holds the test files. When omitted, the
            hard-coded default below is used, so existing callers keep
            working; pass it explicitly instead of editing this file.

    Returns:
        The DataFrame returned by ``pd.read_csv`` (file decoded as UTF-8).

    Raises:
        TypeError: If ``filename`` is not supplied.
    """
    import os

    if filename is None:
        # The original failed here with an opaque "str + NoneType" TypeError.
        raise TypeError("read_test_file() requires a filename")
    if data_dir is None:
        # Default location; replace with the path where the data is stored.
        data_dir = "C://Users//Administrator//Desktop//windomsOcean//data//hy_round1_testA_20200102//"
    return pd.read_csv(os.path.join(data_dir, filename), encoding="utf-8")
2.1.2 读入函数
1. 训练集
# Absolute path of the directory that stores the training data.
train_path = "C://Users//Administrator//Desktop//windomsOcean//data//hy_round1_train_20200102//"
# Concatenate whatever read_data returns for the training set into one object.
data_train = pd.concat(read_data(train_path, Kind="train"))
RUN:
2. 测试集
# Absolute path of the directory that stores the test data.
test_path = "C://Users//Administrator//Desktop//windomsOcean//data//hy_round1_testA_20200102//"
# Concatenate whatever read_data returns for the test set into one object.
data_test = pd.concat(read_data(test_path, Kind="test"))
RUN:
2.2 数据概览
# Print the dimensions of both sets first, then their column names
# (training set before test set each time).
for attribute in ("shape", "columns"):
    for frame in (data_train, data_test):
        print(getattr(frame, attribute))
# Column types of the training set.
data_train.info()
RUN:
(2699638, 7)
(782378, 6)
Index(['渔船ID', 'x', 'y', '速度', '方向', 'time', 'type'], dtype='object')
Index(['渔船ID', 'x', 'y', '速度', '方向', 'time'], dtype='object')
描述性统计
# Percentiles to include in the summary; by default describe() only
# summarises the numeric columns.
quantiles = [0.01, 0.025, 0.05, 0.5, 0.75, 0.9, 0.99]
data_train.describe(percentiles=quantiles)
3.数据处理
3.1缺失值判断
从.info()中Non-Null Count可以看到数据集没有缺失值:
# Column types; the Non-Null Count column of the output shows whether any
# values are missing.
data_train.info()
也可以通过 isnull().any() 逐列检查是否存在缺失值:
# One boolean per column: True if that column contains at least one null value.
data_train.isnull().any()
3.2异常值判断
判断是否存在只有唯一取值(即整列取值都相同)的列:
# Columns that hold at most one distinct value.
# data_train[col].nunique() is the number of distinct values in the column.
# NOTE: nunique() skips NaN by default, so it equals
# len(data_train[col].unique()) only for columns without missing values.
[col for col in data_train.columns if data_train[col].nunique() <= 1]
4.可视化展示
4.1 轨迹可视化
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
plt.subplots_adjust(wspace=0.2, hspace=0.2)
# For every category (one subplot row each), draw three randomly chosen
# trajectories, one per column.
labels = ["ciwang", "weiwang", "tuowang"]
for i, file_type in tqdm(enumerate(["ciwang_data", "weiwang_data", "tuowang_data"])):
    data1, data2, data3 = get_random_three_traj(type=file_type)
    for j, datax in enumerate([data1, data2, data3]):
        # NOTE(review): `.loc[-1:]` is a label slice; on a sorted,
        # non-negative integer index it selects every row - confirm intent.
        x_data = datax["x"].loc[-1:].values
        y_data = datax["y"].loc[-1:].values
        # Column j, not j - 1: the old index wrapped the first trajectory
        # around into the last column.
        ax = axes[i][j]
        ax.scatter(x_data[0], y_data[0], label="start", c="red", s=10, marker="8")
        ax.plot(x_data, y_data, label=labels[i])
        ax.scatter(x_data[-1], y_data[-1], label="end", c="green", s=10, marker="v")
        # alpha must lie in [0, 1]; the previous value of 2 is out of range.
        ax.grid(alpha=0.5)
        ax.legend(loc="best")
plt.show()
4.2 坐标可视化
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
plt.subplots_adjust(wspace=0.5, hspace=0.5)
# One randomly chosen trajectory from the "weiwang" data.
data1 = get_random_one_traj(type="weiwang_data")
# Both coordinates are divided by 10000 before plotting.
# NOTE(review): `.loc[-1:]` is a label slice; on a sorted, non-negative
# integer index it selects every row - confirm intent.
x = data1["x"].loc[-1:]
x = x / 10000
y = data1["y"].loc[-1:]
y = y / 10000
# Sample positions used as the horizontal axis of each panel.
arr1 = np.arange(len(x))
arr2 = np.arange(len(y))
for ax, positions, values, name in ((axes[0], arr1, x, "x"), (axes[1], arr2, y, "y")):
    ax.plot(positions, values, label=name)
    # alpha must lie in [0, 1]; the previous value of 3 is out of range.
    ax.grid(alpha=0.5)
    ax.legend(loc="best")
plt.show()
4.3 速度与方向
4.3.1 速度与方向可视化
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# For every category, pick one random trajectory and plot its speed (left
# column) and its direction (right column) against the sample position.
file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
speed_types = ["ciwang_speed", "weiwang_speed", "tuowang_speed"]
directions = ["ciwang_direction", "weiwang_direction", "tuowang_direction"]
colors = ['pink', 'lightblue', 'lightgreen']
for i, file_name in tqdm(enumerate(file_types)):
    datax = get_random_one_traj(type=file_name)
    # NOTE(review): `.loc[-1:]` is a label slice; on a sorted, non-negative
    # integer index it selects every row - confirm intent.
    x_data = datax["速度"].loc[-1:].values
    y_data = datax["方向"].loc[-1:].values
    panels = ((axes[i][0], x_data, speed_types[i]), (axes[i][1], y_data, directions[i]))
    for ax, series, label in panels:
        ax.plot(range(len(series)), series, label=label, color=colors[i])
        # alpha must lie in [0, 1]; the previous value of 2 is out of range.
        ax.grid(alpha=0.5)
        ax.legend(loc="best")
plt.show()
4.3.2 速度与方向数据分布图
fig, (ax_speed, ax_direction) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
plt.subplots_adjust(wspace=0.3, hspace=0.5)
file_types = ["ciwang_data", "weiwang_data", "tuowang_data"]
labels = ["target==cw", "target==ww", "target==tw"]
colors = ["red", "green", "blue"]
# Per-category frames are collected here; they were previously appended to
# without ever being created. Re-creating them on every run also stops the
# lists from growing when this cell is executed more than once.
df_speeds = []
df_directions = []
for file_type, color in zip(file_types, colors):
    df11, df21 = get_speed_and_direction_distribution_data(file_type)
    # Draw on explicit axes instead of switching the "current" subplot.
    # NOTE(review): newer seaborn releases deprecate `shade` in favour of
    # `fill` - confirm against the installed version before changing it.
    sns.kdeplot(df11["speed"].values / 1000000, color=color, shade=True, ax=ax_speed)
    sns.kdeplot(df21["direction"].values / 1000000, color=color, shade=True, ax=ax_direction)
    df_speeds.append(df11)
    df_directions.append(df21)
ax_speed.legend(labels)
ax_speed.set_xlabel("Speed")
ax_direction.set_xlabel("Direction")
ax_direction.legend(labels)
plt.show()
4.3.3 速度与方向数据分位图
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
plt.subplots_adjust(wspace=0.3, hspace=0.5)
colors_box = ['pink', 'lightblue', 'lightgreen']
# Options shared by both box plots; patch_artist=True yields fillable boxes.
box_options = dict(vert=True, patch_artist=True, labels=["cw", "ww", "tw"])
# Left panel: speed values of the first three collected frames.
bplot1 = axes[0].boxplot([df_speeds[k]["speed"].values for k in range(3)], **box_options)
# Right panel: direction values of the first three collected frames.
bplot2 = axes[1].boxplot([df_directions[k]["direction"].values for k in range(3)], **box_options)
# Give the boxes of each plot the same three fill colours, in order.
for box_plot in (bplot1, bplot2):
    for box_patch, fill_color in zip(box_plot["boxes"], colors_box):
        box_patch.set_facecolor(fill_color)
axes[0].set_title("speed")
axes[1].set_title("direction")
plt.show()
4.3.4 速度与方向数据热力图
# One column per category, each holding the first eight speed values of the
# corresponding collected frame.
df = pd.DataFrame(
    {name: df_speeds[k]["speed"].values[:8] for k, name in enumerate(("cw", "ww", "tw"))}
)
sns.heatmap(df);