通过数据分析可以发现数据的缺失值、异常值以及数据分布情况,并为后续特征工程做准备
1、导入所需模块
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import multiprocessing as mp #多进程操作
import os
import pickle
import random
import read_all_data
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"#用了GPU,加快速度
2、定义加载与存储数据的类
简单查了一下:
pickle模块是将对象以文件的形式存放在磁盘上
#pickle.loads()函数用于将二进制对象转换成Python对象
pickle.dump(data, f)#此函数用于将 Python 对象转换成二进制文件
针对pickle的学习,还需要进一步探索,嘻嘻嘻
class Load_save_data():
    """Pickle Python objects to disk and read them back.

    The instance remembers the most recently used file path in
    ``self.filename`` so that later calls may omit the path.
    """

    def __init__(self, file_name=None):
        """Remember an optional default pickle file path."""
        self.filename = file_name

    def load_data(self, Path=None):
        """Load and return the object pickled at ``Path``.

        Args:
            Path: File to read. When ``None`` the path stored on the
                instance is used instead.

        Returns:
            The unpickled Python object.

        Raises:
            AssertionError: If no path is given and none is stored.
        """
        if Path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = Path
        # Must be opened for binary *reading*: "wb" would truncate the file
        # and make pickle.load() fail on an unreadable handle.
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(self.filename, "rb") as f:
            data = pickle.load(f)
        return data

    def save_data(self, data, path=None):
        """Pickle ``data`` to ``path``.

        Args:
            data: Any picklable Python object.
            path: Destination file. When ``None`` the path stored on the
                instance is used instead.

        Raises:
            AssertionError: If no path is given and none is stored.
        """
        if path is None:
            assert self.filename is not None, "Invalid Path...."
        else:
            self.filename = path
        with open(self.filename, 'wb') as f:
            pickle.dump(data, f)
读取数据并存放
def read_data(Path, Kind=""):
    """Read every file listed in ``Path`` in parallel and return the results.

    Args:
        Path: Directory whose entries are handed to the reader function.
        Kind: ``'train'`` selects ``read_all_data.read_train_file`` and also
            pickles the result to ``./data_tmp/total_data.pkl``; any other
            value selects ``read_all_data.read_test_file``.

    Returns:
        A list with one reader result per directory entry, in the order
        returned by ``os.listdir``.
    """
    # Only bare file names are passed on; presumably the reader functions
    # resolve the directory themselves -- verify against read_all_data.
    filenames = os.listdir(Path)
    if Kind == 'train':
        reader = read_all_data.read_train_file
    else:
        reader = read_all_data.read_test_file

    print("\n@Read Data From" + Path + "....................")
    with mp.Pool(processes=mp.cpu_count()) as pool:
        # imap yields results lazily (still in input order), so tqdm can show
        # real progress; pool.map only returns once everything has finished.
        data_total = list(tqdm(pool.imap(reader, filenames),
                               total=len(filenames)))
    print('\n@End Read total Data.........')

    if Kind == 'train':
        # Make sure the output directory exists before writing the pickle.
        os.makedirs("./data_tmp", exist_ok=True)
        load_save = Load_save_data()
        load_save.save_data(data_total, "./data_tmp/total_data.pkl")
    return data_total
# Load the training set.
# Absolute path of the directory that holds the training files.
train_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_train_20200102"
# Read all files, then stack the per-file results into one DataFrame.
data_train = pd.concat(read_data(train_path, Kind='train'))

# Load the test set.
# Absolute path of the directory that holds the test files.
test_path = r"C:\Users\李\Desktop\datawheal\data\hy_round1_testA_20200102"
data_test = pd.concat(read_data(test_path, Kind='test'))
最终形成了.pkl文件
查看数据的基本情况
# Number of samples and raw feature dimensions of each dataset.
data_test.shape
data_train.shape
data_train.columns
# DataFrame.info() normally reports a non-null count per column, but pandas
# skips that count on frames larger than display.max_info_rows. Raising the
# limit (presumably to the row count of the training set) keeps the counts.
pd.options.display.max_info_rows = 2699638
data_train.info()  # overview: row count, columns, dtypes, non-null counts
data_train.describe()  # summary statistics of the columns
# First and last three rows; DataFrame.append was removed in pandas 2.0,
# so the two slices are joined with pd.concat instead.
pd.concat([data_train.head(3), data_train.tail(3)])
# Inspect missing values and unique values of the features.
# Count the columns that contain at least one missing value:
#   isnull()        -> boolean mask per cell
#   .any()          -> one flag per column (True if any cell is missing)
#   .any().sum()    -> number of flagged columns
# The f prefix evaluates the expression inside the braces. The string literal
# must not be broken across physical lines, so it is split into two adjacent
# literals that Python joins into a single message.
print(
    f'There are {data_train.isnull().any().sum()} '
    'columns in train dataset with missing values'
)
#查看训练集与测试集中特征属性只有一值的特征
one_value_fea = [col for col