通过数据分析可以发现数据的缺失值、异常值和数据分布情况,并为后续特征工程做准备
1、导入所需模块
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import multiprocessing as mp #多进程操作
import os
import pickle
import random
import read_all_data
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"#用了GPU,加快速度
2、定义加载与存储数据,
简单查了一下:
pickle模块是将对象以文件的形式存放在磁盘上
#pickle.loads()函数用于将二进制对象转换成Python对象
pickle.dump(data, f)#此函数用于将 Python 对象转换成二进制文件
针对pickle的学习,还需要进一步探索,嘻嘻嘻
class Load_save_data():
    """Thin pickle-based helper to load/save Python objects from/to disk.

    An optional default path can be bound at construction time; both methods
    fall back to it when called without an explicit path.
    """
    def __init__(self, file_name = None):
        # Default file path used when load_data/save_data get no path argument.
        self.filename = file_name

    def load_data(self, Path = None):
        """Unpickle and return the object stored at Path (or the default path)."""
        if Path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = Path
        # BUG FIX: the original opened the file with mode "wb", which truncates
        # it and makes pickle.load fail; reading requires "rb".
        with open(self.filename, "rb") as f:
            data = pickle.load(f)
        return data

    def save_data(self, data, path):
        """Pickle `data` to `path` (or to the default path when path is None)."""
        if path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = path
        with open(self.filename, 'wb') as f:
            pickle.dump(data, f)
读取数据并存放
def read_data(Path, Kind = ""):
    """Read every file under `Path` in parallel and return a list of results.

    Kind -- 'train' uses read_all_data.read_train_file and caches the combined
    result to ./data_tmp/total_data.pkl; any other value uses
    read_all_data.read_test_file and skips the cache.
    """
    filenames = os.listdir(Path)
    print("\n@Read Data From" + Path + "....................")
    # Pick the per-file reader once instead of re-evaluating it inside the call.
    reader = read_all_data.read_train_file if Kind == 'train' else read_all_data.read_test_file
    with mp.Pool(processes = mp.cpu_count()) as pool:
        # FIX: pool.map() blocks until ALL work is done, so wrapping it in tqdm
        # showed no incremental progress; imap yields results as they complete,
        # which makes the progress bar meaningful.
        data_total = list(tqdm(pool.imap(reader, filenames), total = len(filenames)))
    print('\n@End Read total Data.........')
    if Kind == 'train':
        Load_save_data().save_data(data_total, "./data_tmp/total_data.pkl")
    return data_total
# Read the training data
# Absolute path to the raw training files (machine-specific)
train_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_train_20200102"
# read_data returns a list of per-ship frames; concat merges them into one
data_train = read_data(train_path, Kind = 'train')
data_train = pd.concat(data_train)
# Read the test data
# Absolute path to the raw test files (machine-specific)
test_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_testA_20200102"
data_test = read_data(test_path, Kind = 'test')
data_test = pd.concat(data_test)
最终形成了.pkl文件
查看数据的基本情况
# Inspect dataset size and basic structure (sample counts / feature dims)
data_test.shape
data_train.shape
data_train.columns
# Raise pandas' row threshold so .info() still prints per-column non-null
# counts for the ~2.7M-row training frame (above this threshold pandas skips
# the expensive null count).  FIX: the original line had a stray Chinese
# question fused onto it, which is a SyntaxError.
pd.options.display.max_info_rows = 2699638
data_train.info()  # overview: column count, row count, dtypes, non-null counts
data_train.describe()  # summary statistics of the numeric columns
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement.
pd.concat([data_train.head(3), data_train.tail(3)])
# Missing values and constant (single-valued) features.
# FIX: the original print statements had their f-strings broken across lines
# (a paste/wrap artifact), which is a SyntaxError; rejoined into valid code.
# isnull() flags NaNs element-wise, .any() reduces per column, .sum() counts
# the columns that contain at least one missing value.
print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing values')
# nunique() returns the number of distinct values; a feature with <= 1 distinct
# value carries no information.
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]
one_value_fea_test = [col for col in data_test.columns if data_test[col].nunique() <= 1]
print(f'There are {len(one_value_fea)} columns in train dataset with one unique values')
print(f'There are {len(one_value_fea_test)} columns in test dataset with one unique values')
小Tips:1、print()中的f指在字符串内支持大括号内python表达式的计算
2、nunique()返回特征是唯一值的个数
3、data.isnull():逐元素判断是否为缺失值
/data.isnull().any():按列判断是否存在缺失值
/data.isnull().any().sum():统计含缺失值的列数
渔船轨迹可视化部分:
#渔船轨迹可视化
'''把训练集中的所有数据,根据类别存放到不同的数据文件中'''
def get_diff_data():
    """Split the cached training data into one pickle file per net type.

    Reads ./data_tmp/total_data.pkl and writes ciwang/weiwang/tuowang.pkl,
    each holding the trajectories whose 'type' column matches that net kind.
    """
    with open("./data_tmp/total_data.pkl", 'rb') as f:
        total_data = pickle.load(f)
    saver = Load_save_data()
    kind_data = ['刺网', '围网', '拖网']
    file_names = ['ciwang.pkl', 'weiwang.pkl', 'tuowang.pkl']
    for out_name, kind in zip(file_names, kind_data):
        # unique() yields the distinct values of the column; every trajectory
        # frame carries a single type, so element [0] identifies it.
        matching = [frame for frame in total_data if frame['type'].unique()[0] == kind]
        saver.save_data(matching, './data_tmp/' + out_name)
分三类存储成.pkl文件
随机选取
#从存放某个轨迹类别的数据文件中,随机读取某个渔船的数据
def get_random_one_traj(type = None):
    """Return one trajectory from the pickle file of the given net type.

    The RNG is re-seeded with a fixed seed on every call, so the choice is
    reproducible (the same ship is returned each time for a given file).
    """
    np.random.seed(10)
    with open('./data_tmp/' + type + '.pkl', 'rb') as fh:
        trajectories = pickle.load(fh)
    chosen = np.random.choice(len(trajectories))
    return trajectories[chosen]
#分别从三个类别的数据文件中,随机读取某三个渔船的数据
def get_random_three_traj(type = None):
    """Return three distinct trajectories from the given net type's pickle.

    Uses a fixed seed, so the same three ships come back on every call.
    """
    random.seed(10)
    with open('./data_tmp/' + type + '.pkl', 'rb') as fh:
        trajectories = pickle.load(fh)
    picks = random.sample(list(range(len(trajectories))), 3)
    first, second, third = (trajectories[k] for k in picks)
    return first, second, third
刺网,围网,拖网随机选取三个渔船轨迹可视化
#每个类别中随机三个渔船轨迹可视化
def visualize_three_traj():
fig, axes = plt.subplots(nrows = 3, ncols = 3, figsize = (20, 15))
plt.subplots_adjust(wspace = 0.2, hspace = 0.2)
lables = ['ciwang', 'weiwang', 'tuowang']
for i, file_type in tqdm(enumerate(['ciwang', 'weiwang', 'tuowang'])):
data1, data2, data3 = get_random_three_traj(type = file_type)
for j, datax in enumerate([data1, data2, data3]):
x_data = datax['x'].loc[-1:].values
y_data = datax['y'].loc[-1:].values
axes[i][j-1].scatter(x_data[0], y_data[0], label = 'start', c = 'red',
s = 10, marker = '8')
axes[i][j - 1].plot(x_data, y_data, label = lables[i])
axes[i][j-1].scatter(x_data[len(x_data) - 1], y_data[len(y_data) - 1],
label = 'end', c = 'green', s = 10, marker = 'v')
axes[i][j-1].grid(alpha = 2)
axes[i][j-1].legend(loc = 'best')
plt.show()
三种类型的渔船随机轨迹如下所示:
坐标可视化
#随机选取某条数据,观察x与y的变化情况:判断POI(兴趣点)
def visualize_one_traj_x_y():
    """Plot the x and y coordinate series (scaled by 1e4) of one random
    weiwang trajectory, stacked in two rows, to eyeball candidate POIs."""
    fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 8))
    plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
    traj = get_random_one_traj(type = 'weiwang')
    # .loc[-1:] on a default RangeIndex keeps every row; scale down for readability.
    xs = traj['x'].loc[-1:] / 10000
    ys = traj['y'].loc[-1:] / 10000
    axes[0].plot(np.arange(len(xs)), xs, label = 'x')
    axes[1].plot(np.arange(len(ys)), ys, label = 'y')
    for panel in axes:
        panel.grid(alpha = 3)
        panel.legend(loc = 'best')
    plt.show()
速度数据在某时段存在同时接近于0的情况,可以判断POI兴趣点
三类渔船分别随机选取某一个渔船对其进行速度和方向可视化
#每类轨迹,随机选取某个渔船,可视化速度序列与方向序列
def visualize_three_traj_speed_direction():
    """For one random ship per net type, plot its speed series (left column)
    and direction series (right column) on a 3x2 grid."""
    fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (20, 15))
    plt.subplots_adjust(wspace = 0.3, hspace = 0.3)
    net_types = ['ciwang', 'weiwang', 'tuowang']
    speed_labels = ['ciwang_speed', 'weiwang_speed', 'tuowang_speed']
    direction_labels = ['ciwang_direction', 'weiwang_direction', 'tuowang_direction']
    palette = ['pink', 'lightblue', 'lightgreen']
    for row, net_type in tqdm(enumerate(net_types)):
        traj = get_random_one_traj(type=net_type)
        # Column names are Chinese in the raw data: 速度 = speed, 方向 = direction.
        speeds = traj["速度"].loc[-1:].values
        directions = traj["方向"].loc[-1:].values
        left, right = axes[row]
        left.plot(range(len(speeds)), speeds, label=speed_labels[row],
                  color=palette[row])
        left.grid(alpha=2)
        left.legend(loc="best")
        right.plot(range(len(directions)), directions, label=direction_labels[row],
                   color=palette[row])
        right.grid(alpha=2)
        right.legend(loc="best")
    plt.show()
结果如下所示
分析三类渔船速度与方向的数据分布情况
#三类渔船速度方向的数据分布
'''对某一特征进行数据统计
type: "ciwang", "weiwang" or "tuowang"
param path: 存放数据的路径
param kind: '速度' or '方向'
param columns: 与 kind 对应, 'speed' or 'direction'
'''
def get_data_cummulation(type = None, path = None, kind = None, columns = None):
    """Count value frequencies of one column across all trajectories of a type.

    type    -- "ciwang", "weiwang" or "tuowang" (pickle file stem under path)
    path    -- directory holding the per-type pickle files
    kind    -- raw column name, '速度' (speed) or '方向' (direction)
    columns -- output column name matching kind, 'speed' or 'direction'
    Returns a one-column DataFrame indexed by value, sorted ascending by value.
    """
    counts = {}
    with open(path + type + '.pkl', 'rb') as file:
        data_list = pickle.load(file)
    for traj in tqdm(data_list):
        for value in traj[kind].values:
            counts[value] = counts.get(value, 0) + 1
    # dict keys are unique, so sorting items sorts purely by value key.
    ordered = dict(sorted(counts.items()))
    return pd.DataFrame.from_dict(ordered, columns = [columns], orient = 'index')
Tips:
Python 字典中的 setdefault(key, default):检查 dict 中是否有 key 键,若没有则添加该键并将其值设为 default,并返回该键对应的值
速度与方向的分布数据
#分别得到速度和方向的分布数据
def get_speed_and_direction_distribution_data(type=None):
    """Return (speed_df, direction_df) frequency tables for one net type."""
    base = './data_tmp/'
    speed_df = get_data_cummulation(type=type, path=base, kind='速度',
                                    columns='speed')
    direction_df = get_data_cummulation(type=type, path=base, kind='方向',
                                        columns='direction')
    return speed_df, direction_df
# Module-level caches: plot_speed_direction1_distribution() appends one
# frequency DataFrame per net type here, and the box-plot function reads
# them back by index — so the KDE function must run first.
df_speeds = []
df_directions = []
def plot_speed_direction1_distribution():
    """KDE plots of the speed (left) and direction (right) frequency
    distributions for the three net types; also fills the module-level
    df_speeds / df_directions caches used by the box-plot function."""
    plt.subplots(nrows = 1, ncols = 2, figsize = (15, 6))
    plt.subplots_adjust(wspace = 0.3, hspace = 0.5)
    file_types = ["ciwang", "weiwang", "tuowang"]
    lables = ["target==cw", "target==ww", "target==tw"]
    colors = ["red", "green", "blue"]
    # FIX: reset the caches so a second call does not append duplicate frames
    # (plot_speed_direction2_distribution indexes entries 0..2 positionally).
    del df_speeds[:]
    del df_directions[:]
    for i, filenames in enumerate(file_types):
        df11, df21 = get_speed_and_direction_distribution_data(file_types[i])
        plt.subplot(1,2,1)
        ax1 = sns.kdeplot(df11["speed"].values / 1000000,
                          color=colors[i],shade=True)
        plt.subplot(1,2,2)
        ax3 = sns.kdeplot(df21["direction"].values / 1000000,
                          color=colors[i],shade=True)
        df_speeds.append(df11)
        df_directions.append(df21)
    ax1.legend(lables)
    ax1.set_xlabel("Speed")
    ax3.set_xlabel("Direction")
    ax3.legend(lables)
    plt.show()
#使用分位图对速度和方向的数据分布进行可视化
def plot_speed_direction2_distribution():
    """Box plots comparing the speed (left) and direction (right) frequency
    distributions of the three net types; reads the module-level
    df_speeds / df_directions caches, so the KDE function must run first."""
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    plt.subplots_adjust(wspace=0.3, hspace=0.5)
    box_colors = ['pink', 'lightblue', 'lightgreen']
    speed_series = [df_speeds[k]["speed"].values for k in range(3)]
    direction_series = [df_directions[k]["direction"].values for k in range(3)]
    speed_plot = axes[0].boxplot(speed_series, vert=True, patch_artist=True,
                                 labels=["cw", "ww", "tw"])
    direction_plot = axes[1].boxplot(direction_series, vert=True,
                                     patch_artist=True, labels=["cw", "ww", "tw"])
    for box_group in (speed_plot, direction_plot):
        for patch, color in zip(box_group["boxes"], box_colors):
            patch.set_facecolor(color)
    axes[0].set_title("speed")
    axes[1].set_title("direction")
    plt.show()
速度数据分布大致是符合正态分布的
方向数据的差异性不大