Python - Feature Extraction & Feature Selection

1. dataset: archive.ics.uci.edu/ml/datasets/Adult

2. Explanation & Code

#    dataset :archive.ics.uci.edu/ml/datasets/Adult

import os
import pandas as pd
data_folder = os.path.join(os.getcwd(),"Data/adult.data")

adult = pd.read_csv(data_folder, header=None,
                    names=["Age", "Work-Class", "fnlwgt", "Education", "Education-Num",
                           "Marital-Status", "Occupation", "Relationship", "Race", "Sex",
                           "Capital-gain", "Capital-loss", "Hours-per-week",
                           "Native-country", "Earnings-Raw"])
adult.dropna(how='all', inplace=True)  # drop completely empty rows (the raw file ends with a blank line)
# print(adult.columns)

# print(adult["Hours-per-week"].describe())

# print(adult["Work-Class"].unique())

# Create a new boolean feature: works more than 40 hours per week
adult["LongHours"] = adult["Hours-per-week"] > 40

# print(adult.columns)

x = adult[["Age","Education-Num","Capital-gain","Capital-loss","Hours-per-week"]].values

# The raw label strings carry a leading space, hence ' >50K'
y = (adult["Earnings-Raw"] == ' >50K').values
print(y)

from sklearn.feature_selection import SelectKBest, chi2
transformer = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = transformer.fit_transform(x, y)
print(transformer.scores_)
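
The scores alone do not say which three columns were kept. A minimal sketch of how to recover that, assuming the same five-column feature list used to build x above:

# Sketch: map the chi2-selected columns back to their names.
# feature_cols mirrors the columns used to build x above.
feature_cols = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
print([feature_cols[i] for i in transformer.get_support(indices=True)])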

from scipy.stats import pearsonr
import numpy as np
def multivariate_pearsonr(x, y):
    # Score each column by the absolute Pearson correlation with y, so that
    # strongly negative correlations also rank as informative under SelectKBest.
    scores, pvalues = [], []
    for column in range(x.shape[1]):
        cur_score, cur_p = pearsonr(x[:, column], y)
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))

transformer = SelectKBest(score_func=multivariate_pearsonr,k=3)
Xt_pearsonr = transformer.fit_transform(x,y)
print(transformer.scores_)
#
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
score_chi2 = cross_val_score(clf,Xt_chi2,y,scoring='accuracy')
score_pearsonr = cross_val_score(clf,Xt_pearsonr,y,scoring='accuracy')
print("{0:.3f},{0:.3f}".format(np.mean(score_chi2*100),np.mean(score_pearsonr*100)))

3. dataset: https://archive.ics.uci.edu/ml/datasets/Internet+Advertisements

4. Code & Explanation

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

data_file = os.path.join(os.getcwd(),"Data/ad.data")

def convert_number(x):
    try:
        return float(x)
    except ValueError:
        return np.nan

converters = defaultdict(convert_number)
# Column 1558 is the class label: "ad." vs "nonad."
converters[1558] = lambda x: 1 if x.strip() == "ad." else 0

# Handle "?" missing-value markers: convert_number already turns "?" (and any
# other non-numeric token) into NaN while converting everything else to float,
# so register it explicitly for every feature column.
for i in range(1558):
    converters[i] = convert_number

ads = pd.read_csv(data_file, header=None, converters=converters, low_memory=False)


print(ads[:5])
ads.dropna(inplace=True)
X = ads.drop(1558,axis=1).values
y = ads[1558]
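
As an aside, a similar cleanup can often be done with pandas' built-in missing-value handling instead of per-column converters; an unverified sketch, assuming the "?" tokens in ad.data are only padded with spaces:

# Sketch: let pandas turn "?" into NaN and strip the padding itself
ads_alt = pd.read_csv(data_file, header=None, skipinitialspace=True, na_values=["?"])
ads_alt[1558] = (ads_alt[1558] == "ad.").astype(int)  # class label to 0/1
ads_alt.dropna(inplace=True)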

from sklearn.decomposition import PCA
pca = PCA(n_components=5)
Xd = pca.fit_transform(X)

np.set_printoptions(precision=5,suppress=True)
print(pca.explained_variance_ratio_)
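
To choose n_components less arbitrarily, one common check is the cumulative explained variance of a fit with more components; a sketch reusing X from above:

# Sketch: running total of variance captured by the first 20 components
pca_full = PCA(n_components=20)
pca_full.fit(X)
print(np.cumsum(pca_full.explained_variance_ratio_))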

clf = DecisionTreeClassifier(random_state=14)
scores_reduce = cross_val_score(clf,Xd,y,scoring='accuracy')
print(np.mean(scores_reduce*100))
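
For context, a quick sketch of the same classifier scored on all 1558 original features:

# Sketch: baseline accuracy without dimensionality reduction
scores_full = cross_val_score(DecisionTreeClassifier(random_state=14), X, y, scoring='accuracy')
print(np.mean(scores_full * 100))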

from matplotlib import pyplot as plt
classes = set(y)
colors = ['red','green']
# Scatter of the first two principal components, coloured by class
for cur_class, color in zip(classes, colors):
    mask = (y == cur_class).values
    plt.scatter(Xd[mask, 0], Xd[mask, 1], marker='o', color=color, label=int(cur_class))
plt.legend()
plt.show()