数据总体了解:
读取数据集并了解数据集大小,原始特征维度;
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
# Suppress all warnings so they do not clutter the notebook output.
warnings.simplefilter('ignore')
# Load the training set; a comma is already read_csv's default separator.
data_train = pd.read_csv('train.csv')
# Bare expression: rendered as the cell output when run in a notebook.
data_train
通过info熟悉数据类型;
# Print a concise summary of the frame: per-column non-null counts and dtypes.
data_train.info()
粗略查看数据集中各特征基本统计量;
# Descriptive statistics (count, mean, std, min, quartiles, max); with the
# default arguments only numeric columns are summarized for a mixed-dtype frame.
data_train.describe()
缺失值和唯一值:
查看数据缺失值情况
以比例形式查看各特征的缺失值占比;由于特征较多,仅显示存在缺失值的项,并按占比从高到低排序。
# Share of missing entries per column: null count divided by the number of rows.
miss = data_train.isna().sum().div(len(data_train))
# Show only the columns that actually have missing values, largest share first.
miss.loc[miss.gt(0)].sort_values(ascending=False)
查看唯一值特征情况
# Find the columns of the training and test sets that hold at most one
# distinct value (nunique() ignores NaN by default).
# NOTE(review): data_test_a is not defined anywhere in the visible part of this
# file; it has to be loaded before this point or the second statement raises
# a NameError.
one_value_fea = data_train.columns[(data_train.nunique() <= 1).to_numpy()].tolist()
one_value_fea_test = data_test_a.columns[(data_test_a.nunique() <= 1).to_numpy()].tolist()
one_value_fea
one_value_fea_test
out:['policyCode']
out:[