#! /usr/bin/env python
#coding=utf-8
"""Compare two univariate feature-selection scorers on the adult census file.

The script loads ``adult.data.txt``, keeps the five numeric columns listed in
``FEATURE_COLUMNS``, selects the three best of them once with the chi-squared
score and once with the absolute Pearson correlation, and prints the mean
cross-validated accuracy of a decision tree trained on each selection.

The print calls use the single-argument parenthesised form, which behaves the
same under Python 2 and Python 3.
"""
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# Directory holding the data file.
# NOTE (from the original author): the data was copied from the web page
# before it had finished loading, so the local data set is incomplete.
root = "F:/Data/exe/3_adult/"

# Column labels assigned to the header-less CSV, in file order.  The spelling
# "Matrital-Status" is kept exactly as in the original script because it is a
# runtime column key.
COLUMN_NAMES = [
    "Age", "Work-Class", "fnlwgt",
    "Education", "Education-Num",
    "Matrital-Status", "Occupation",
    "Relationship", "Race", "Sex",
    "Capital-gain", "Capital-loss",
    "Hours-per-week", "Native-Country", "Earnings-Raw",
]

# Numeric columns used as the candidate feature matrix.
FEATURE_COLUMNS = [
    "Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week",
]


def load_adult(path):
    """Read the header-less CSV at *path* and return it as a labelled DataFrame.

    The frame gets the labels in ``COLUMN_NAMES``, loses rows in which every
    value is missing, and gains a boolean ``LongHours`` column that is true
    where ``Hours-per-week`` exceeds 40.
    """
    adult = pd.read_csv(path, header=None)
    adult.columns = COLUMN_NAMES
    # how="all" removes only rows that are entirely NaN; inplace=True edits
    # this frame instead of building a new one.
    adult.dropna(how="all", inplace=True)
    # adult["Work-Class"].unique() lists every distinct work-class value.
    adult["LongHours"] = adult["Hours-per-week"] > 40
    return adult


def multivariate_pearsonr(x, y):
    """Score every column of *x* by its absolute Pearson correlation with *y*.

    ``scipy.stats.pearsonr`` handles one pair of 1-D arrays at a time, so it
    is applied column by column.

    Args:
        x: 2-D array, one column per feature.
        y: 1-D array with one entry per row of *x*.

    Returns:
        A ``(scores, pvalues)`` pair of 1-D arrays with one entry per column
        of *x*; ``scores`` holds ``abs(r)`` so that strong negative
        correlations rank as high as strong positive ones.
    """
    scores, pvalues = [], []
    # Iterating over the transpose walks the columns without index arithmetic.
    for feature in np.asarray(x).T:
        cur_score, cur_p = pearsonr(feature, y)
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return np.array(scores), np.array(pvalues)


def main():
    """Run both feature selections and print the two mean accuracies."""
    # scikit-learn is imported here rather than at module level so that
    # importing this module for multivariate_pearsonr alone does not need it.
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    from sklearn.model_selection import cross_val_score
    from sklearn.tree import DecisionTreeClassifier

    adult = load_adult(root + "adult.data.txt")

    # .values turns the frame/series into plain arrays (matrix and vector).
    x = adult[FEATURE_COLUMNS].values
    # The label keeps its leading space because that is how the field is
    # stored when the CSV is read without stripping whitespace.
    y = (adult["Earnings-Raw"] == " >50K").values

    # fit_transform returns only the k selected columns of x; the per-feature
    # scores are available afterwards as transformer.scores_.
    transformer = SelectKBest(score_func=chi2, k=3)
    xt_chi2 = transformer.fit_transform(x, y)

    transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
    xt_pearson = transformer.fit_transform(x, y)

    clf = DecisionTreeClassifier(random_state=14)

    scores_chi2 = cross_val_score(clf, xt_chi2, y, scoring="accuracy")
    # The original author's run recorded 0.828598676158 here.
    print("精确度:{}".format(np.mean(scores_chi2)))

    scores_pearson = cross_val_score(clf, xt_pearson, y, scoring="accuracy")
    # The original author's run recorded 0.770645941597 here.
    print("精确度:{}".format(np.mean(scores_pearson)))


if __name__ == "__main__":
    main()
# python_adult_pca
# 最新推荐文章于 2024-05-23 11:16:42 发布