智慧海洋建设——Task2 数据分析
一,数据集中特征缺失值、唯一值:
数据收集过程中往往会有各种原因造成数据的缺失,如传感器损坏等。数据的缺失往往会给数据处理带来一些麻烦,而数据往往都是昂贵的,因为部分数据的缺失而丢弃整个数据集也往往不可取,所以我们需要事先了解缺失的数据,用一些手段取代这些缺失的数据。这篇知乎文章较为详细地介绍了对于数据缺失的一些处理方法。
下面是本项目中具体的实施代码:
#引入所需要的库
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
#引入所需要的库
from tqdm import tqdm
import multiprocessing as mp
import os
import pickle
import random
# 把读取所有数据的函数放在单独的python文件中,是为了解决多线程问题在jupyter notebook无法运行的问题
import read_all_data #这是作者事先写好的一段代码,可以打开文档,使用时需要根据自己文件位置修改代码
下面附上这段代码
# -*- codeing = utf-8 -*-
# @Time : 2021/3/7 22:23
# @Author : Evan_wyl
# @File : read_all_data.py
'''import pandas as pd
def read_train_file(filename=None):
# 替换数据存放的路径
# Path = "D:/code_sea/data/train/hy_round1_train_20200102/" 修改成自己的
Path = "C:/Users/suqil/Desktop/智慧海洋/数据集/hy_round1_train_20200102/"
return pd.read_csv(Path + filename,encoding="utf-8")
def read_test_file(filename=None):
# 替换数据存放的路径
# 修改成自己的
Path = "C:/Users/suqil/Desktop/智慧海洋/数据集/hy_round1_testA_20200102/"
return pd.read_csv(Path + filename,encoding="utf-8")'''
定义加载和存储数据的类:
class Load_Save_Data():
    """Small helper that pickles data to disk and loads it back.

    The target path can be fixed at construction time (``file_name``) or
    supplied per call; a per-call path overrides and replaces the stored one.
    """

    def __init__(self, file_name=None):
        # Default path used when load_data/save_data receive no path.
        self.filename = file_name

    def load_data(self, Path=None):
        """Unpickle and return the object stored at ``Path`` (or ``self.filename``).

        Raises AssertionError when neither a call-time path nor a stored
        filename is available.
        """
        if Path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = Path
        # BUG FIX: the original opened the file with "wb", which truncates the
        # file before reading; unpickling requires binary *read* mode.
        with open(self.filename, "rb") as f:
            data = pickle.load(f)
        return data

    def save_data(self, data, path):
        """Pickle ``data`` to ``path`` (falls back to ``self.filename`` when None)."""
        if path is None:
            assert self.filename is not None, "Invalid path...."
        else:
            self.filename = path
        with open(self.filename, "wb") as f:
            pickle.dump(data, f)
定义读取数据的函数
def read_data(Path, Kind=""):
    """Read every csv file under ``Path`` in parallel; return a list of DataFrames.

    :param Path: directory holding the per-vessel csv files
    :param Kind: 'train' or 'test' -- selects the reader function and, for
                 'train', also pickles the combined result to disk.
    """
    filenames = os.listdir(Path)
    print("\n@Read Data From" + Path + ".........................")
    # Pick the reader once instead of re-evaluating the conditional inline.
    reader = read_all_data.read_train_file if Kind == "train" else read_all_data.read_test_file
    with mp.Pool(processes=mp.cpu_count()) as pool:
        # NOTE: pool.map blocks until all work is done, so tqdm only shows the
        # final state here; kept as-is to preserve the original behaviour.
        data_total = list(tqdm(pool.map(reader, filenames), total=len(filenames)))
    print("\n@End Read total Data............................")
    load_save = Load_Save_Data()
    if Kind == "train":
        # Adjust this output path to your own folder layout.
        load_save.save_data(data_total, "./数据集/total_data.pkl")
    return data_total
随后导入自己获得的数据:
# Demo: manually replicate what read_data() does for the training set.
Path = "C:/Users/suqil/Desktop/智慧海洋/数据集/hy_round1_train_20200102/"
Kind = "train"
filenames = os.listdir(Path)  # per-vessel csv file names
filenames  # display the file names (notebook cell output)
# FIX: the pasted original lost the indentation of the `with` body,
# which made it a syntax error; restored here.
with mp.Pool(processes=mp.cpu_count()) as pool:
    data_total = list(tqdm(pool.map(read_all_data.read_train_file if Kind == "train" else
                                    read_all_data.read_test_file, filenames),
                           total=len(filenames)))
read_all_data.read_train_file  # display the reader function (notebook cell output)
分别读取两个文件内的数据
# Read and concatenate the train and test sets into single DataFrames.
train_path = "C:/Users/33309/Desktop/智慧海洋/数据集/hy_round1_train_20200102/"
data_train = read_data(train_path, Kind="train")
data_train = pd.concat(data_train)  # stack the per-file DataFrames into one
test_path = "C:/Users/33309/Desktop/智慧海洋/数据集/hy_round1_testA_20200102/"
data_test = read_data(test_path, Kind="test")
data_test = pd.concat(data_test)
# NOTE(review): the original paste repeated read_data()'s internals here
# (print / Load_Save_Data / save_data on an undefined `data_total`, with
# stray prose fused onto the last line, a syntax error); read_data already
# saves the train pickle, so the broken duplicate has been removed.
# Missing values: isnull().any() flags columns containing at least one NaN;
# summing the boolean Series counts those columns.
# FIX: the original f-string message was truncated after the number.
print(f'There are {data_train.isnull().any().sum()} columns with missing values.')
# Constant features: columns with at most one distinct value carry no
# information for modelling and are candidates for removal.
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]
one_value_fea_test = [col for col in data_test.columns if data_test[col].nunique() <= 1]
one_value_fea       # display constant columns in train (notebook cell output)
one_value_fea_test  # display constant columns in test (notebook cell output)
对于数据集特征缺失值和唯一值分析就结束了