python数据挖掘入门与实践----------特征值,主成分分析


#http://archive.ics.uci.edu/ml/machine-learning-databases/adult/
import os
import pandas as pd
adult_filename = "adult.data"

# The file is read with header=None, so the column names are supplied here.
adult = pd.read_csv(
    adult_filename,
    header=None,
    names=[
        "Age", "Work-Class", "fnlwgt", "Education",
        "Education-Num", "Marital-Status", "Occupation",
        "Relationship", "Race", "Sex", "Capital-gain",
        "Capital-loss", "Hours-per-week", "Native-Country",
        "Earnings-Raw",
    ],
)
# how="all" removes only the rows in which every column is missing.
adult = adult.dropna(how="all")

import numpy as np
# Toy dataset: the integers 0..29 laid out as 10 samples x 3 features.
X = np.arange(30).reshape(10, 3)
X[:, 1] = 1  # overwrite the second column with the constant 1

from sklearn.feature_selection import VarianceThreshold
# VarianceThreshold drops the features whose variance does not pass its
# threshold; the threshold is left at the library default here.
vt = VarianceThreshold()
Xt = vt.fit(X).transform(X)

# Feature matrix: five columns of the adult table, as a plain array.
X = adult[
    ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
].values
# Target: True where Earnings-Raw equals ' >50K' (the literal keeps its leading space).
y = adult["Earnings-Raw"].eq(' >50K').values

# Keep the 3 features that score best against y under the chi-squared function.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
transformer = SelectKBest(k=3, score_func=chi2)
Xt_chi2 = transformer.fit(X, y).transform(X)

#皮尔逊系数
from scipy.stats import pearsonr

def multivariate_pearsonr(X, y):
    """Score each column of X by its Pearson correlation with y.

    Args:
        X: 2-D array, indexed as ``X[:, column]``.
        y: Target values, passed to ``pearsonr`` alongside each column.

    Returns:
        A ``(scores, pvalues)`` tuple of numpy arrays with one entry per
        column of X. ``scores`` holds the absolute value of each correlation
        coefficient and ``pvalues`` the p-value reported by ``pearsonr``.
    """
    # pearsonr handles one column at a time, so evaluate them one by one.
    per_column = [pearsonr(X[:, idx], y) for idx in range(X.shape[1])]
    scores = np.array([abs(coefficient) for coefficient, _ in per_column])
    pvalues = np.array([p_value for _, p_value in per_column])
    return (scores, pvalues)
# Same top-3 selection, this time scored with multivariate_pearsonr.
transformer = SelectKBest(k=3, score_func=multivariate_pearsonr)
Xt_pearson = transformer.fit(X, y).transform(X)
print(transformer.scores_)

# Compare the accuracy of a decision tree trained on the chi2-selected
# features with one trained on the Pearson-selected features.
from sklearn.tree import DecisionTreeClassifier
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # cross_val_score now lives in sklearn.model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for old scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import cross_val_score
clf = DecisionTreeClassifier(random_state=14)

scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy')
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy')

# Report the mean accuracy over the cross-validation folds.
print("Chi2 performance: {0:.3f}".format(scores_chi2.mean()))
print("Pearson performance: {0:.3f}".format(scores_pearson.mean()))

#http://archive.ics.uci.edu/ml/machine-learning-databases/internet_ads/
# Creating new features
import os
import pandas as pd
import numpy as np
# The data file is looked up at <home>/Data/Ads/ad.data.
home_dir = os.path.expanduser("~")
data_folder = os.path.join(home_dir, "Data")
data_filename = os.path.join(data_folder, "Ads", "ad.data")
def convert_number(x):
    """Convert a string to a float, mapping unparseable text to NaN.

    Args:
        x: The value to convert, handed to ``float``.

    Returns:
        ``float(x)`` when the conversion succeeds, ``np.nan`` when ``float``
        raises ``ValueError``.
    """
    try:
        number = float(x)
    except ValueError:
        # Not a number (for example a placeholder such as "?").
        number = np.nan
    return number
from collections import defaultdict
# Give every feature column (0..1557) an explicit convert_number entry.
# The earlier defaultdict(convert_number) only registered convert_number as a
# zero-argument default factory, which cannot serve as a per-column converter,
# so the feature columns were never converted to floats.
converters = {column: convert_number for column in range(1558)}
# Column 1558 is the class label: 1 for "ad.", 0 for anything else.
converters[1558] = lambda x: 1 if x.strip() == "ad." else 0

ads = pd.read_csv(data_filename, header=None, converters=converters)
# convert_number maps unparseable entries to NaN; drop those rows so the
# estimators fitted on X receive finite values only.
ads.dropna(inplace=True)
X = ads.drop(1558, axis=1).values
y = ads[1558]

# Decision tree baseline on the full feature set.
from sklearn.tree import DecisionTreeClassifier
try:
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # cross_val_score now lives in sklearn.model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for old scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X, y, scoring='accuracy')
# Mean accuracy over the cross-validation folds.
print("The average score is {:.4f}".format(np.mean(scores)))

# Principal component analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=5)  # request 5 components from the decomposition
# Xd is X projected onto those components.
Xd = pca.fit_transform(X)
# Print floats with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Bare expression: presumably meant to be shown as notebook cell output;
# in a plain script this line displays nothing.
pca.explained_variance_ratio_

# Repeat the decision-tree evaluation on the PCA-reduced data Xd.
clf = DecisionTreeClassifier(random_state=14)
scores_reduced = cross_val_score(estimator=clf, X=Xd, y=y, scoring="accuracy")
print(
    "The average score from the reduced dataset is {:.4f}".format(
        scores_reduced.mean()
    )
)

%matplotlib inline
from matplotlib import pyplot as plt
# Plot the first two columns of Xd, one scatter series (and colour) per class.
classes = set(y)
colors = ['red', 'green']
for cur_class, color in zip(classes, colors):
    # Boolean row selector for the samples of the current class.
    mask = y.eq(cur_class).values
    class_points = Xd[mask]
    plt.scatter(
        class_points[:, 0],
        class_points[:, 1],
        marker='o',
        color=color,
        label=int(cur_class),
    )
plt.legend()

plt.show()

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值