Python 特征选择与绘图,以及使用 minepy(MINE)计算最大信息系数(MIC)

本文通过使用ExtraTreesClassifier进行特征选择,并对数据进行可视化分析,展示了如何从大量特征中挑选出重要的特征,以及如何利用matplotlib进行不同类别数据的可视化。

demo代码:

# _*_coding:UTF-8_*_
import numpy as np
import sys 
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import sys 
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
import os
from minepy import MINE

def iterbrowse(path):
    """Lazily yield the full path of every file found under ``path``.

    The tree is traversed with ``os.walk``; each yielded value is the
    directory being visited joined with one of its file names.
    Directories themselves are never yielded.
    """
    for current_dir, _subdirs, file_names in os.walk(path):
        full_paths = (os.path.join(current_dir, name) for name in file_names)
        for full_path in full_paths:
            yield full_path


def get_data(filename, expected_fields=78, skip_fields=3):
    """Parse a tab-separated feature file into a list of float rows.

    Every line must split on tabs into exactly ``expected_fields`` pieces.
    The first ``skip_fields`` pieces are discarded and the remainder are
    converted with ``float``.

    Args:
        filename: Path of the text file to read.
        expected_fields: Required number of tab-separated fields per line
            (defaults to 78, the value the original code hard-coded).
        skip_fields: Number of leading fields to drop before conversion
            (defaults to 3, as in the original code).

    Returns:
        A list with one ``list[float]`` per line of the file; an empty list
        for an empty file.

    Raises:
        ValueError: If a line has the wrong number of fields (the message
            names the file, the line number and the offending line), or if
            a retained field cannot be converted by ``float``.
    """
    rows = []
    with open(filename) as f:
        # Iterate the file object directly instead of loading every line
        # into memory first.
        for line_no, line in enumerate(f, 1):
            fields = line.split("\t")
            if len(fields) != expected_fields:
                raise ValueError(
                    "%s:%d: expected %d tab-separated fields, got %d: %r"
                    % (filename, line_no, expected_fields, len(fields), line))
            rows.append([float(value) for value in fields[skip_fields:]])
    return rows


if __name__ == '__main__':
    # Demo pipeline:
    #   1. load the labelled white/black sample matrices,
    #   2. rank the features with an ExtraTreesClassifier,
    #   3. append the "todo" verification samples as extra classes,
    #   4. plot every feature column per class,
    #   5. print the MIC of every feature column against the label.
    neg_file = "cc_data/black_all.txt"
    pos_file = "cc_data/white_all.txt"
    X = []
    y = []
    # Labels: 0 = rows from pos_file (white), 1 = rows from neg_file (black).
    # (The original carried a disabled variant that repeated the black set
    # five times; it was dead code and is not reproduced here.)
    for data_file, label in ((pos_file, 0), (neg_file, 1)):
        if not os.path.isfile(data_file):
            continue
        if data_file.endswith('.txt'):
            samples = np.genfromtxt(data_file)
        elif data_file.endswith('.npy'):
            samples = np.load(data_file)
        else:
            raise ValueError(
                "unsupported data file extension: %s" % data_file)
        X.extend(samples)
        y += [label] * len(samples)

    if not X:
        # Without this guard the code below fails with an opaque
        # IndexError on X[0].
        sys.exit("no training data found in %s or %s" % (pos_file, neg_file))

    print("len of X:", len(X))
    print("X sample:", X[:3])
    print("len of y:", len(y))
    print("y sample:", y[:3])
    # Drop the first three columns of every row before training.
    X = [x[3:] for x in X]
    print("filtered X sample:", X[:3])

    # Column names are the remaining column positions offset by 6, as strings.
    cols = [str(i + 6) for i in range(len(X[0]))]
    clf = ExtraTreesClassifier()
    clf.fit(X, y)
    print(clf.feature_importances_)
    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Features sorted by their score:")
    print(sorted(zip(clf.feature_importances_, cols), reverse=True))

    black_verify = []
    for f in iterbrowse("todo/top"):
        print(f)
        black_verify += get_data(f)
    # Seen when row widths disagree:
    # ValueError: operands could not be broadcast together with shapes (1,74) (75,) (1,74)
    print(black_verify)
    black_verify_labels = [3] * len(black_verify)

    white_verify = get_data("todo/white_verify.txt")
    print(white_verify)
    white_verify_labels = [2] * len(white_verify)

    # Loaded and printed only; not added to X below.
    unknown_verify = get_data("todo/pek_feature74.txt")
    print(unknown_verify)

    # extend data: append the verification samples as classes 3 and 2.
    X = np.concatenate((X, black_verify))
    y += black_verify_labels
    X = np.concatenate((X, white_verify))
    y += white_verify_labels

    #################################### plot ####################################
    data_train = pd.DataFrame(X)
    data_train.columns = cols

    # add label column
    data_train["label"] = pd.Series(y)

    print(data_train.info())
    print(data_train.columns)

    import matplotlib.pyplot as plt

    # Legend entries in the same order as the class labels plotted below.
    class_names = (u'white', u'black', u"white-todo", u"black-todo")
    for col in cols:
        # One figure per feature. The original also created a second, empty
        # figure per column that was never drawn on; it is no longer created.
        plt.figure()
        for label in (0.0, 1.0, 2.0, 3.0):
            data_train[data_train.label == label][col].plot()
        plt.xlabel(u"sample data id")
        plt.ylabel(u"value")
        plt.title(col)
        plt.legend(class_names, loc='best')
        plt.show()

    print("calculate MINE mic value:")
    for col in cols:
        # http://minepy.readthedocs.io/en/latest/python.html#second-example
        mine = MINE(alpha=0.6, c=15, est="mic_approx")
        mine.compute_score(data_train[col], y)
        print("%s MIC= %s" % (col, mine.mic()))

    # NOTE(review): exits with a non-zero status even after a successful
    # run; kept as in the original. Confirm whether this is intended.
    sys.exit(-1)

说明:代码中的 "extend data" 部分表示待预测(待验证)的数据。

 

关于 MIC(最大信息系数),minepy 文档中的示例代码如下:

from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from minepy import MINE


# Module-level random source with a fixed seed (0), used by the sampling
# helpers in this script.
rs = np.random.RandomState(0)

def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):
    """Draw one scatter panel titled with Pearson r and MIC of ``x`` vs ``y``.

    Args:
        x, y: The two samples to compare and plot.
        numRows, numCols, plotNum: Grid position passed to ``plt.subplot``.
        xlim, ylim: Axis limits for the panel.

    Returns:
        The axes object created by ``plt.subplot``.
    """
    # Both statistics are rounded to one decimal before being shown.
    pearson = np.corrcoef(x, y)[0, 1]
    r = np.around(pearson, 1)

    estimator = MINE(alpha=0.6, c=15, est="mic_approx")
    estimator.compute_score(x, y)
    mic = np.around(estimator.mic(), 1)

    panel = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    panel.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic), fontsize=10)

    # Strip the frame and both axes so only the point cloud remains.
    panel.set_frame_on(False)
    for axis in (panel.axes.get_xaxis(), panel.axes.get_yaxis()):
        axis.set_visible(False)

    panel.plot(x, y, ',')
    panel.set_xticks([])
    panel.set_yticks([])
    return panel

def rotation(xy, t):
    """Right-multiply the rows of ``xy`` by a 2x2 rotation matrix for ``t``.

    The matrix is ``[[cos t, -sin t], [sin t, cos t]]`` with ``t`` in
    radians, so the row vector ``(1, 0)`` maps to ``(cos t, -sin t)``.
    """
    cos_t = np.cos(t)
    sin_t = np.sin(t)
    matrix = [[cos_t, -sin_t],
              [sin_t, cos_t]]
    return np.dot(xy, matrix)

def mvnormal(n=1000):
    """Plot panels 1-7: bivariate normal samples at seven correlations.

    For each correlation from 1.0 down to -1.0, ``n`` points are drawn from
    the shared ``rs`` generator with unit variances and that covariance,
    then handed to ``mysubplot`` on the 3x7 grid.
    """
    mean = [0, 0]
    correlations = (1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0)
    for panel, rho in enumerate(correlations, 1):
        covariance = [[1, rho], [rho, 1]]
        sample = rs.multivariate_normal(mean, covariance, n)
        mysubplot(sample[:, 0], sample[:, 1], 3, 7, panel)

def rotnormal(n=1000):
    """Plot panels 8-14: one sample drawn once, shown at seven rotations.

    ``n`` points are drawn from ``rs`` with covariance ``[[1, 1], [1, 1]]``;
    the same sample is then rotated by each angle from 0 to pi/2 and passed
    to ``mysubplot`` on the 3x7 grid.
    """
    angles = [
        0,
        np.pi/12,
        np.pi/6,
        np.pi/4,
        np.pi/2-np.pi/6,
        np.pi/2-np.pi/12,
        np.pi/2,
    ]
    base = rs.multivariate_normal([0, 0], [[1, 1], [1, 1]], n)
    first_panel = 8
    for offset, angle in enumerate(angles):
        turned = rotation(base, angle)
        mysubplot(turned[:, 0], turned[:, 1], 3, 7, first_panel + offset)

def others(n=1000):
    """Plot panels 15-21: seven non-linear or structured relationships.

    All random draws come from the shared seeded ``rs`` generator. The
    original drew the last panel from the unseeded global ``np.random``,
    which made that one panel differ from run to run.

    Args:
        n: Number of points per panel (the last panel uses ``n // 4`` points
            for each of its four clusters).
    """
    # Panel 15: quartic curve with uniform noise.
    x = rs.uniform(-1, 1, n)
    y = 4*(x**2-0.5)**2 + rs.uniform(-1, 1, n)/3
    mysubplot(x, y, 3, 7, 15, (-1, 1), (-1.0/3, 1+1.0/3))

    # Panel 16: uniform square rotated by -pi/8.
    y = rs.uniform(-1, 1, n)
    xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2+np.sqrt(2)) / np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 16, (-lim, lim), (-lim, lim))

    # Panel 17: the same square rotated by a further -pi/8.
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 17, (-lim, lim), (-lim, lim))

    # Panel 18: parabola with uniform noise.
    y = 2*x**2 + rs.uniform(-1, 1, n)
    mysubplot(x, y, 3, 7, 18, (-1, 1), (-1, 3))

    # Panel 19: noisy parabola with a random sign per point.
    # randint(0, 2) draws from {0, 1}; it replaces the deprecated
    # random_integers(0, 1). The uniform draw stays first so the order of
    # draws from rs is unchanged.
    magnitude = x**2 + rs.uniform(0, 0.5, n)
    signs = np.array([-1, 1])[rs.randint(0, 2, size=n)]
    y = magnitude * signs
    mysubplot(x, y, 3, 7, 19, (-1.5, 1.5), (-1.5, 1.5))

    # Panel 20: noisy circle. y is computed before x is overwritten.
    y = np.cos(x * np.pi) + rs.uniform(0, 1.0/8, n)
    x = np.sin(x * np.pi) + rs.uniform(0, 1.0/8, n)
    mysubplot(x, y, 3, 7, 20, (-1.5, 1.5), (-1.5, 1.5))

    # Panel 21: four unit-variance clusters, one per quadrant.
    identity = [[1, 0], [0, 1]]
    centers = ([3, 3], [-3, 3], [-3, -3], [3, -3])
    clusters = [rs.multivariate_normal(center, identity, n // 4)
                for center in centers]
    xy = np.concatenate(clusters, axis=0)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 21, (-7, 7), (-7, 7))

# Build and display the 3x7 grid of panels on one white-background figure:
# mvnormal fills panels 1-7, rotnormal 8-14 and others 15-21.
plt.figure(facecolor='white')
mvnormal(n=800)
rotnormal(n=200)
others(n=800)
# Adjust subplot spacing before the figure is shown.
plt.tight_layout()
plt.show()
_images/relationships.png

转载于:https://www.cnblogs.com/bonelee/p/9081328.html

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符  | 博主筛选后可见
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值