# Data import
import os
import pandas as pd
adult_filename = "adult.data"
# The file is read with header=None, so the column names are supplied here.
adult_columns = [
    "Age",
    "Work-Class",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
    "Native-Country",
    "Earnings-Raw",
]
adult = pd.read_csv(adult_filename, header=None, names=adult_columns)
# Discard rows in which every field is missing.
adult.dropna(how="all", inplace=True)
# Optional exploration (uncomment as needed):
# adult.columns
# adult.loc[:5]
# adult["Hours-per-week"].describe()  # summary statistics of the weekly hours
# adult["Education-Num"].median()     # median of the Education-Num column
# adult["Work-Class"].unique()        # distinct work-class values
# Derived feature: True when more than 40 hours are worked per week.
adult["LongHours"] = adult["Hours-per-week"].gt(40)
##################################### Remove features whose variance is below the threshold (the lower the variance, the less a feature distinguishes individuals)
import numpy as np
# Build a 10x3 integer matrix holding the values 0..29.
X = np.arange(30).reshape(10, 3)
# Overwrite the second column with a constant so that its variance is 0.
X[:, 1] = 1
from sklearn.feature_selection import VarianceThreshold
# Fit the selector, then apply it: Xt keeps only the first and third columns
# of X because the zero-variance second column is dropped.
vt = VarianceThreshold().fit(X)
Xt = vt.transform(X)
# Uncomment to show the variance of every column:
# print(vt.variances_)
#################################### Univariate feature selection
# Feature matrix built from five numeric columns of the data set.
X = adult[["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]].values
# Target class: True when the raw earnings label is ">50K".
# The Adult data set spells the label with a capital K, and because the file
# is read without skipinitialspace the values normally carry a leading space.
# Strip whitespace before comparing; the previous comparison with '>50k'
# never matched, leaving every target False.
Y = (adult["Earnings-Raw"].str.strip() == ">50K").values
# Build the transformer: chi-squared scoring.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Keep the 3 features with the best chi-squared score.
transformer = SelectKBest(score_func=chi2, k=3)
# Score the features and reduce X to the selected columns.
Xt_chi2 = transformer.fit_transform(X, Y)
# print(transformer.scores_)
# Build the transformer: Pearson correlation coefficient scoring
from scipy.stats import pearsonr
def multivariate_pearsonr(X, y):
    """Score each column of ``X`` by its absolute Pearson correlation with ``y``.

    The return value has the ``(scores, pvalues)`` layout of the scoring
    function passed to ``SelectKBest`` in this file.

    Args:
        X: 2-D array; ``X[:, i]`` is the i-th feature column.
        y: 1-D target with one entry per row of ``X``. Boolean or integer
            targets are accepted and converted to float.

    Returns:
        A tuple ``(scores, pvalues)`` of 1-D arrays with one entry per column
        of ``X``: the absolute correlation coefficient and the p-value that
        ``pearsonr`` reports for it.
    """
    # Convert once so boolean class labels are treated as 0.0 / 1.0.
    target = np.asarray(y, dtype=float)
    scores, pvalues = [], []
    for column in range(X.shape[1]):
        cur_score, cur_p = pearsonr(X[:, column], target)
        # The sign is dropped: only the strength of the relationship matters
        # when ranking features.
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))
# Score the features with the Pearson-based function and keep the best 3.
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
transformer.fit(X, Y)
Xt_pearson = transformer.transform(X)
print(transformer.scores_)
############################### Compare the two feature-selection methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# One decision tree with a fixed seed is cross-validated on each reduced
# feature set, chi2 first and Pearson second.
clf = DecisionTreeClassifier(random_state=14)
scores_chi2, scores_pearson = (
    cross_val_score(clf, features, Y, scoring="accuracy")
    for features in (Xt_chi2, Xt_pearson)
)
# Uncomment to report the mean accuracy of each method:
# print("The chi2_method accuracy is {0:.1f}%".format(100*np.mean(scores_chi2)))
# print("The pearson_method accuracy is {0:.1f}%".format(100*np.mean(scores_pearson)))
# scikit-learn feature analysis (Data Mining: Introduction and Practice - Experiment 7)
# Latest recommended article published on 2023-04-09 11:00:00