import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline SEED = 222 np.random.seed(SEED) df = pd.read_csv('input.csv') #切分训练集和测试集 from sklearn.model_selection import train_test_split from sklearn.metrics import roc_auc_score def get_train_test(test_size = 0.95): y = 1 * (df.cand_pty_affiliation == "REP") X = df.drop(["cand_pty_affiliation"], axis = 1) X = pd.get_dummies(X, sparse = True) #对样本的特征进行独热编码“one-hot encoding” X.drop(X.columns[X.std() == 0], axis = 1, inplace = True) #去掉标准差=0 即该特征所有样本都一样的列 return train_test_split(X,y,test_size = test_size, random_state = SEED) xtrain, xtest, ytrain, ytest = get_train_test() print("\nExample data:") df.head()
- cand_pty_affiliation:我们要预测的目标,共和党或民主党
- entity_tp:个人还是组织
- classification:所属领域
- rpt_tp:捐款规模
- cycle:捐赠发生的年份
- transaction_amt:捐赠金额
# Check the class balance of the raw data: the relative frequency of each
# cand_pty_affiliation value (here, Democrats vs. Republicans).
party_share = df.cand_pty_affiliation.value_counts(normalize=True)
party_share.plot(kind="bar", title="Share of No. donations")
plt.show()
import pydotplus #导入结构化图形绘制工具 from IPython.display import Image #导入图片显示的库,能够打开图片文件在jupyter中进行显示 from sklearn.metrics import roc_auc_score from sklearn.tree import DecisionTreeClassifier, export_graphviz #导入决策树模型和绘制决策树.dot文件的库 def print_graph(clf, feature_names): "打印决策树" graph &#