智慧海洋---Task2 数据分析阶段

通过数据分析可以发现数据的缺失值、异常值以及数据分布情况,并为后续特征工程做准备
1、导入所需模块

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import multiprocessing as mp #多进程操作
import os
import pickle
import random
import read_all_data
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"#用了GPU,加快速度

2、定义加载与存储数据的工具类
简单查了一下:
pickle模块可以将Python对象序列化后以文件的形式存放在磁盘上
#pickle.load(f)函数用于从二进制文件中读取并还原Python对象(pickle.loads()则是从bytes对象还原)
pickle.dump(data, f)#此函数用于将 Python 对象序列化并写入二进制文件
针对pickle的学习,还需要进一步探索,嘻嘻嘻

class Load_save_data():
    """Small helper that pickles Python objects to disk and reads them back.

    The most recently used path is remembered in ``self.filename`` so that a
    later call may omit the path argument.
    """

    def __init__(self, file_name = None):
        # Fallback path used when load_data()/save_data() get no explicit path.
        self.filename = file_name

    def load_data(self, Path = None):
        """Unpickle and return the object stored in a file.

        Args:
            Path: File to read. When None, the path remembered in
                ``self.filename`` is used instead.

        Returns:
            The Python object that was pickled into the file.

        Raises:
            AssertionError: If neither ``Path`` nor ``self.filename`` is set.
        """
        if Path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = Path
        # The file must be opened for reading ("rb"). The previous "wb" mode
        # truncated the file and made pickle.load() fail.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(self.filename, "rb") as f:
            data = pickle.load(f)
        return data

    def save_data(self, data, path = None):
        """Pickle ``data`` into a file.

        Args:
            data: Any picklable Python object.
            path: File to write. When None, the path remembered in
                ``self.filename`` is used instead.

        Raises:
            AssertionError: If neither ``path`` nor ``self.filename`` is set.
        """
        if path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = path
        with open(self.filename, 'wb') as f:
            pickle.dump(data, f)

读取数据并存放

def read_data(Path, Kind = ""):
    """Read every file listed under ``Path`` in parallel and return the results.

    Args:
        Path: Directory whose entries are handed to the reader function.
        Kind: ``'train'`` selects ``read_all_data.read_train_file`` and also
            caches the result in ``./data_tmp/total_data.pkl``; any other
            value selects ``read_all_data.read_test_file``.

    Returns:
        A list with one reader result per directory entry, in the order
        returned by ``os.listdir``.
    """
    # NOTE(review): bare file names (not joined with Path) are passed to the
    # readers; presumably read_all_data resolves the directory itself - verify.
    filenames = os.listdir(Path)
    reader = (read_all_data.read_train_file if Kind == 'train'
              else read_all_data.read_test_file)
    workers = mp.cpu_count()
    # Batch the tasks so that inter-process overhead stays small.
    chunk = max(1, len(filenames) // (workers * 4))
    print("\n@Read Data From" + Path + "....................")
    with mp.Pool(processes = workers) as pool:
        # imap yields results lazily and in input order, so the progress bar
        # advances while files are processed. pool.map only returned once
        # everything was finished, which made the bar jump from 0 to 100%.
        data_total = list(tqdm(pool.imap(reader, filenames, chunksize = chunk),
                               total = len(filenames)))
    print('\n@End Read total Data.........')
    load_save = Load_save_data()
    if Kind == 'train':
        cache_path = "./data_tmp/total_data.pkl"
        # Make sure the cache directory exists before writing into it.
        os.makedirs(os.path.dirname(cache_path), exist_ok = True)
        load_save.save_data(data_total, cache_path)
    return data_total
# Load the training set: read_data returns one object per file and the
# pieces are concatenated into a single pandas object.
# Absolute path of the directory that holds the training files.
train_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_train_20200102"
data_train = pd.concat(read_data(train_path, Kind = 'train'))
# Load the test set in the same way.
# Absolute path of the directory that holds the test files.
test_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_testA_20200102"
data_test = pd.concat(read_data(test_path, Kind = 'test'))

最终形成了.pkl文件
在这里插入图片描述
查看数据的基本情况

# Inspect the number of samples and the number of raw feature columns.
# These are bare expressions: their values are only displayed when run in an
# interactive session such as a notebook.
data_test.shape

data_train.shape

# Column labels of the training set.
data_train.columns

在这里插入图片描述

# display.max_info_rows is the frame size above which DataFrame.info() skips
# the per-column null count; raise it so info() still reports those counts.
# NOTE(review): 2699638 is presumably the row count of data_train - confirm.
pd.options.display.max_info_rows = 2699638
data_train.info()  # overview: number of rows/columns, column index, dtypes
data_train.describe()  # summary statistics of every numeric column
# First and last three rows. DataFrame.append was removed in pandas 2.0, so
# the two slices are combined with pd.concat instead.
pd.concat([data_train.head(3), data_train.tail(3)])
# Missing values and single-valued features.
# isnull() flags each cell, .any() reduces to one flag per column and .sum()
# counts the columns that contain at least one missing value.
# The f prefix lets the braces hold a Python expression that is evaluated.
print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing values')
# Columns of the training / test set that hold at most one distinct value
# (nunique() returns the number of distinct values in a column).
one_value_fea = [col for col in data_train.columns
                 if data_train[col].nunique() <= 1]
one_value_fea_test = [col for col in data_test.columns
                      if data_test[col].nunique() <= 1]
print(f'There are {len(one_value_fea)} columns in train dataset with one unique values')
print(f'There are {len(one_value_fea_test)} columns in test dataset with one unique values')

小Tips:1、print()中的f指在字符串内支持大括号内python表达式的计算

2、nunique()返回某个特征(列)中不同取值的个数

3、data.isnull():用来判断缺失值
在这里插入图片描述

data.isnull().any():判断每一列是否存在缺失值
在这里插入图片描述

data.isnull().any().sum():统计存在缺失值的列的个数
在这里插入图片描述
渔船轨迹可视化部分:

#渔船轨迹可视化
'''把训练集中的所有数据,根据类别存放到不同的数据文件中'''
def get_diff_data():
    """Split the cached training data by class and pickle each class.

    Reads ``./data_tmp/total_data.pkl`` and, for each of the three labels,
    collects the items whose first distinct ``type`` value equals that label
    and writes the resulting list to its own file under ``./data_tmp/``.
    """
    with open("./data_tmp/total_data.pkl", 'rb') as source:
        trajectories = pickle.load(source)
    writer = Load_save_data()
    # (label stored in the 'type' column, output file name)
    targets = (('刺网', 'ciwang.pkl'), ('围网', 'weiwang.pkl'), ('拖网', 'tuowang.pkl'))
    for label, out_name in targets:
        matching = []
        for traj in trajectories:
            # Only the first distinct value of the 'type' column is compared.
            if traj['type'].unique()[0] == label:
                matching.append(traj)
        writer.save_data(matching, './data_tmp/' + out_name)

分三类存储成.pkl文件
在这里插入图片描述
随机选取

# Pick one vessel's record from the pickle file of a given trajectory class.
def get_random_one_traj(type = None):
    """Return one pseudo-randomly chosen item from ``./data_tmp/<type>.pkl``.

    NumPy's global generator is re-seeded with 10 on every call, so the same
    item is returned each time for a given file.

    Args:
        type: File stem of the class file, e.g. ``'ciwang'``.

    Returns:
        The selected element of the unpickled sequence.
    """
    np.random.seed(10)
    file_path = './data_tmp/' + type + '.pkl'
    with open(file_path, 'rb') as handle:
        records = pickle.load(handle)
    chosen = np.random.choice(len(records))
    return records[chosen]
# Pick three vessels' records from the pickle file of a given trajectory class.
def get_random_three_traj(type = None):
    """Return three distinct pseudo-randomly chosen items from a class file.

    The ``random`` module is re-seeded with 10 on every call, so the same
    three items are returned each time for a given file.

    Args:
        type: File stem of the class file under ``./data_tmp/``.

    Returns:
        A 3-tuple with the selected elements of the unpickled sequence.
    """
    random.seed(10)
    with open('./data_tmp/' + type + '.pkl', 'rb') as handle:
        records = pickle.load(handle)
    positions = list(range(len(records)))
    first, second, third = random.sample(positions, 3)
    return records[first], records[second], records[third]

刺网,围网,拖网随机选取三个渔船轨迹可视化

# Plot three randomly chosen vessel trajectories for every class.
def visualize_three_traj():
    """Draw a 3x3 grid of trajectories: one row per class, one column per vessel.

    Each panel shows the (x, y) track as a line, with the first point marked
    in red ('start') and the last point marked in green ('end').
    """
    fig, axes = plt.subplots(nrows = 3, ncols = 3, figsize = (20, 15))
    plt.subplots_adjust(wspace = 0.2, hspace = 0.2)
    lables = ['ciwang', 'weiwang', 'tuowang']
    for i, file_type in tqdm(enumerate(lables), total = len(lables)):
        trajectories = get_random_three_traj(type = file_type)
        for j, datax in enumerate(trajectories):
            # NOTE(review): .loc[-1:] is a label-based slice; with a 0-based
            # integer index it selects every row - confirm this is intended.
            x_data = datax['x'].loc[-1:].values
            y_data = datax['y'].loc[-1:].values
            # Use column j directly. The previous axes[i][j-1] depended on
            # negative-index wraparound and put the first vessel in the last
            # column.
            ax = axes[i][j]
            ax.scatter(x_data[0], y_data[0], label = 'start', c = 'red',
                       s = 10, marker = '8')
            ax.plot(x_data, y_data, label = lables[i])
            ax.scatter(x_data[-1], y_data[-1], label = 'end', c = 'green',
                       s = 10, marker = 'v')
            # alpha must lie within 0-1; the previous value 2 is rejected by
            # current matplotlib releases.
            ax.grid(alpha = 1)
            ax.legend(loc = 'best')
    plt.show()

三种类型的渔船随机轨迹如下所示:

在这里插入图片描述
坐标可视化

# Plot how x and y evolve for one sampled record, to spot POIs (points of interest).
def visualize_one_traj_x_y():
    """Plot the x and y series of one 'weiwang' record in two stacked panels.

    Both series are divided by 10000 before plotting and drawn against their
    positional index.
    """
    fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 8))
    plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
    data1 = get_random_one_traj(type = 'weiwang')
    # NOTE(review): .loc[-1:] is a label-based slice; with a 0-based integer
    # index it selects every row - confirm this is intended.
    x = data1['x'].loc[-1:] / 10000
    y = data1['y'].loc[-1:] / 10000
    for ax, series, name in ((axes[0], x, 'x'), (axes[1], y, 'y')):
        ax.plot(np.arange(len(series)), series, label = name)
        # alpha must lie within 0-1; the previous value 3 is rejected by
        # current matplotlib releases.
        ax.grid(alpha = 1)
        ax.legend(loc = 'best')
    plt.show()

x与y坐标在某些时段同时保持不变(即速度接近于0),据此可以判断POI兴趣点
在这里插入图片描述
三类渔船分别随机选取某一个渔船对其进行速度和方向可视化

# For each class, plot the speed and direction series of one sampled vessel.
def visualize_three_traj_speed_direction():
    """Draw a 3x2 grid: one row per class, speed on the left, direction on the right.

    One record per class is taken from get_random_one_traj and its "速度" and
    "方向" columns are plotted against their positional index.
    """
    fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (20, 15))
    plt.subplots_adjust(wspace = 0.3, hspace = 0.3)
    file_types = ['ciwang', 'weiwang', 'tuowang']
    speed_types = ['ciwang_speed', 'weiwang_speed', 'tuowang_speed']
    doirections = ['ciwang_direction', 'weiwang_direction', 'tuowang_direction']
    colors = ['pink', 'lightblue', 'lightgreen']
    for i, file_name in tqdm(enumerate(file_types), total = len(file_types)):
        datax = get_random_one_traj(type=file_name)
        # NOTE(review): .loc[-1:] is a label-based slice; with a 0-based
        # integer index it selects every row - confirm this is intended.
        x_data = datax["速度"].loc[-1:].values
        y_data = datax["方向"].loc[-1:].values
        panels = ((axes[i][0], x_data, speed_types[i]),
                  (axes[i][1], y_data, doirections[i]))
        for ax, series, name in panels:
            ax.plot(range(len(series)), series, label=name, color=colors[i])
            # alpha must lie within 0-1; the previous value 2 is rejected by
            # current matplotlib releases.
            ax.grid(alpha=1)
            ax.legend(loc="best")
    plt.show()

结果如下所示
在这里插入图片描述
分析三类渔船速度与方向的数据分布情况

#三类渔船速度方向的数据分布
'''对某一特征进行数据统计,
 type:"ciwang","weiwang" or "tuowang"
 param path:存放数据路径
 param kind: '速度' or '方向'
 param columns:与kind对应,'speed' or 'direction'''
def get_data_cummulation(type = None, path = None, kind = None, columns = None):
    """Count how often each distinct value of one column occurs in a class file.

    Args:
        type: File stem of the class file, e.g. ``'ciwang'``.
        path: Directory prefix that is concatenated in front of ``type``.
        kind: Name of the column to tally in every unpickled item.
        columns: Name given to the single count column of the result.

    Returns:
        A DataFrame indexed by the distinct values in ascending order, with
        one column holding the number of occurrences.
    """
    with open(path + type + '.pkl', 'rb') as handle:
        frames = pickle.load(handle)
    counts = {}
    for frame in tqdm(frames):
        for value in frame[kind].values:
            counts[value] = counts.get(value, 0) + 1
    # Rebuild the mapping with keys in ascending order.
    ordered = {key: counts[key] for key in sorted(counts)}
    return pd.DataFrame.from_dict(ordered, orient = 'index', columns = [columns])

Tips:
Python 字典中的 setdefault(key[, default]):检查dict中是否有key这个键,若没有则以default作为值添加该键,并返回该键对应的值
在这里插入图片描述
速度与方向的分布数据

# Build the value-count tables for speed and for direction.
def get_speed_and_direction_distribution_data(type=None):
    """Return the speed and direction count tables of one class.

    Args:
        type: File stem of the class file under ``./data_tmp/``.

    Returns:
        A tuple ``(data_speed_df, data_direction_df)`` as produced by
        get_data_cummulation for the "速度" and "方向" columns.
    """
    path = './data_tmp/'
    data_speed_df = get_data_cummulation(type = type, path = path, kind = '速度',
                                         columns = 'speed')
    # The column name must be one unbroken literal; it had been split across
    # two source lines, which is a syntax error.
    data_direction_df = get_data_cummulation(type = type, path = path, kind = '方向',
                                             columns = 'direction')
    return data_speed_df, data_direction_df
# Per-class count tables filled by plot_speed_direction1_distribution() and
# read by plot_speed_direction2_distribution(); order: ciwang, weiwang, tuowang.
df_speeds = []
df_directions = []
def plot_speed_direction1_distribution():
    """Plot kernel density estimates of the speed and direction count tables.

    For each of the three classes the count tables are computed, drawn as a
    filled KDE curve (speed on the left panel, direction on the right) and
    stored in the module-level lists ``df_speeds`` / ``df_directions``.
    """
    fig, (ax_speed, ax_direction) = plt.subplots(nrows = 1, ncols = 2,
                                                 figsize = (15, 6))
    plt.subplots_adjust(wspace = 0.3, hspace = 0.5)
    file_types = ["ciwang", "weiwang", "tuowang"]
    lables = ["target==cw", "target==ww", "target==tw"]
    colors = ["red", "green", "blue"]
    # Reset in place so repeated calls do not keep appending duplicates; the
    # same list objects stay shared with plot_speed_direction2_distribution.
    df_speeds.clear()
    df_directions.clear()
    for file_type, color in zip(file_types, colors):
        df11, df21 = get_speed_and_direction_distribution_data(file_type)
        # fill= replaces the deprecated shade= keyword of seaborn.kdeplot.
        sns.kdeplot(df11["speed"].values / 1000000, color=color, fill=True,
                    ax=ax_speed)
        sns.kdeplot(df21["direction"].values / 1000000, color=color, fill=True,
                    ax=ax_direction)
        df_speeds.append(df11)
        df_directions.append(df21)
    ax_speed.legend(lables)
    ax_speed.set_xlabel("Speed")
    ax_direction.set_xlabel("Direction")
    ax_direction.legend(lables)
    plt.show()

 #使用分位图对速度和方向的数据分布进行可视化
def plot_speed_direction2_distribution():
    """Draw box plots of the speed and direction count tables of the three classes.

    Reads the first three entries of the module-level lists ``df_speeds`` and
    ``df_directions``, so those must have been filled beforehand.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    plt.subplots_adjust(wspace=0.3, hspace=0.5)
    class_labels = ["cw", "ww", "tw"]
    speed_samples = [df_speeds[k]["speed"].values for k in range(3)]
    speed_boxes = axes[0].boxplot(speed_samples, vert = True,
                                  patch_artist = True, labels = class_labels)
    direction_samples = [df_directions[k]["direction"].values for k in range(3)]
    direction_boxes = axes[1].boxplot(direction_samples, vert=True,
                                      patch_artist=True, labels=class_labels)
    # Give the three boxes of each panel the same three fill colours.
    for box_plot in (speed_boxes, direction_boxes):
        for patch, fill_color in zip(box_plot["boxes"],
                                     ['pink', 'lightblue', 'lightgreen']):
            patch.set_facecolor(fill_color)
    axes[0].set_title("speed")
    axes[1].set_title("direction")
    plt.show()

速度数据分布大致是符合正态分布的
在这里插入图片描述
方向数据的差异性不大
在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值