python_adult_pca

#! /usr/bin/env python
#coding=utf-8
import pandas as pd
# NOTE (original author): the web page was select-all copied before it had
# finished loading, so the local copy of the data set is incomplete.
root = "F:/Data/exe/3_adult/"
adult = pd.read_csv(root + "adult.data.txt", header=None)
# The file is read without a header row, so the 15 columns are labelled here.
# NOTE(review): "Matrital-Status" looks like a typo of "Marital-Status"; the
# label is kept unchanged in case other code refers to it - confirm.
adult.columns = [
    "Age",
    "Work-Class",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Matrital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
    "Native-Country",
    "Earnings-Raw",
]
# Drop rows in which every field is missing; inplace=True modifies this data
# frame instead of building a new one.
adult.dropna(how="all", inplace=True)
# Tip: adult["Work-Class"].unique() lists every distinct value of a column.
adult["LongHours"] = adult["Hours-per-week"] > 40
# Taking .values turns the selections into plain arrays (a matrix for x and a
# vector for y), so they can be indexed numerically from here on.
feature_columns = [
    "Age",
    "Education-Num",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
]
x = adult[feature_columns].values
is_high_earner = adult["Earnings-Raw"] == " >50K"
y = is_high_earner.values
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 3 columns of x that score highest under the chi-squared test.
transformer = SelectKBest(score_func=chi2, k=3)
# fit() on its own would return the SelectKBest object; fit_transform()
# returns a numpy.ndarray holding only the selected columns of x, which is the
# form later handed to cross_val_score().
xt_chi2 = transformer.fit_transform(x, y)
# After fitting, transformer.scores_ holds each feature's score against y.
# Reminder: x.shape[0] is the number of rows, x.shape[1] the number of columns.
# Next step: judge relevance with the Pearson correlation coefficient.
from scipy.stats import pearsonr#pearsonr函数
import numpy as np
# pearsonr takes two arrays and returns the correlation coefficient and the
# p-value for that single pair; the helper below applies it column by column.
def multivariate_pearsonr(x, y):
    """Score every column of ``x`` by its absolute Pearson correlation with ``y``.

    The return shape (an array of scores and an array of p-values, one entry
    per column) is what ``SelectKBest`` expects from a ``score_func``.

    Args:
        x: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like of length n_samples; boolean or numeric.

    Returns:
        A tuple ``(scores, pvalues)`` of 1-D numpy arrays, one entry per
        column of ``x``. ``scores`` holds ``abs(r)`` so that strong negative
        correlations rank as high as strong positive ones.
    """
    # Coerce to float arrays so boolean labels and plain lists are handled
    # uniformly by pearsonr.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    scores, pvalues = [], []
    # Iterating over the transpose yields one feature column at a time.
    for feature in x.T:
        cur_score, cur_p = pearsonr(feature, y)
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return np.array(scores), np.array(pvalues)
# Keep the 3 columns of x with the highest absolute Pearson correlation to y.
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
xt_pearson = transformer.fit_transform(x, y)  # the k best-scoring columns of x
# After fitting, transformer.scores_ holds the per-feature scores.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# A fixed random_state keeps the tree, and therefore the scores, reproducible.
clf = DecisionTreeClassifier(random_state=14)
# Compare the two feature subsets by mean cross-validated accuracy.
# print(...) with a single argument prints the same text on Python 2 and also
# parses on Python 3, unlike the statement form.
scores_chi2 = cross_val_score(clf, xt_chi2, y, scoring="accuracy")
print("精确度:{}".format(np.mean(scores_chi2)))  # author's recorded result: 0.828598676158
scores_pearson = cross_val_score(clf, xt_pearson, y, scoring="accuracy")
print("精确度:{}".format(np.mean(scores_pearson)))  # author's recorded result: 0.770645941597

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值