对乳腺癌数据集进行特征工程,以提升逻辑回归(LR)模型的准确率
导入所需的库:
import pandas as pd
# 众数
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
1、加载数据
# 1. Load the breast-cancer dataset via scikit-learn's loader.
cancer_data = load_breast_cancer()
# Feature matrix; print its type to see which container the loader returned.
feature_data = cancer_data['data']
print(type(feature_data))
# Target labels.
label_data = cancer_data.target
由输出可知,特征数据为 numpy.ndarray 类型。
将其转换为 DataFrame 类型,并添加列名。
# 2. Missing-value handling.
# First wrap the feature array in a DataFrame, using the dataset's feature
# names as column labels.
df = pd.DataFrame(data=feature_data, columns=cancer_data.feature_names)
# Preview the first rows and confirm the resulting type.
print(df.head(), type(df))
2、缺失值处理
(1)查看是否有空值
print(df.isnull().sum