1、数据源: http://blog.csdn.net/wiking__acm/article/details/50971461
2、代码:
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer
# Load the watermelon (西瓜) train/test splits from local CSV files.
# NOTE(review): absolute Windows paths — adjust for your environment.
train_data = pd.read_csv('D:\\workspace\\kaggle\\data\\zhouzhihua-gua\\train_data.csv')
test_data = pd.read_csv('D:\\workspace\\kaggle\\data\\zhouzhihua-gua\\test_data.csv')
# Convert the categorical string features to integer labels (0..N-1).
def encode_features(df_train, df_test):
    """Label-encode the categorical watermelon features in both frames.

    One LabelEncoder is fitted per feature on the union of the train and
    test values, so both frames share the same string->integer mapping.
    The frames are modified in place and also returned.

    Parameters: df_train, df_test -- DataFrames containing the six
    categorical columns listed below.
    Returns: the (df_train, df_test) pair with those columns encoded.
    """
    # 色泽/根蒂/敲声/纹理/脐部/触感: color, root, knock sound,
    # texture, navel, touch.
    features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
    # Combine both splits so the encoder sees every category that
    # appears in either one.
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        # Replace the category strings with their integer codes in
        # both the training and the test frame.
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
def simplify_interval_info(df):
    """Discretize the continuous 密度 (density) and 含糖率 (sugar rate)
    columns into integer-labelled bins of width 0.1.

    Density maps to labels 0-7 over (0, 0.8]; sugar rate to labels 0-4
    over (0, 0.5]. The frame is modified in place and also returned.

    NOTE(review): pd.cut uses left-open intervals, so a value of exactly
    0 (or above the top edge) falls outside every bin and becomes NaN.
    """
    bins_density = (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8)
    bins_sugar = (0, 0.1, 0.2, 0.3, 0.4, 0.5)
    group_name_density = [0, 1, 2, 3, 4, 5, 6, 7]
    group_name_sugar = [0, 1, 2, 3, 4]
    # e.g. density 0.05 -> label 0, 0.75 -> label 7; sugar 0.45 -> 4.
    category_density = pd.cut(df['密度'], bins_density, labels=group_name_density)
    category_sugar = pd.cut(df['含糖率'], bins_sugar, labels=group_name_sugar)
    df['密度'] = category_density
    df['含糖率'] = category_sugar
    return df
# --- Preprocess both splits: encode categoricals, bin continuous columns. ---
train_data, test_data = encode_features(train_data, test_data)
train_data = simplify_interval_info(train_data)
test_data = simplify_interval_info(test_data)

# Separate features from the 好瓜 ("good melon") label column.
X_all = train_data.drop(['好瓜'], axis=1)
y_all = train_data['好瓜']
# Hand-written ground-truth labels for the 3-row external test file.
y_result = [1, 0, 0]

# Hold out half of the training data for validation.
num_test = 0.50
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=3)

# Parameter grid for the random forest: tree count and split criterion
# (information entropy vs. Gini impurity).
parameters = {'n_estimators': [5, 6, 7],
              'criterion': ['entropy', 'gini']}
# Score parameter combinations by plain classification accuracy.
acc_scorer = make_scorer(accuracy_score)
clf = RandomForestClassifier()

# Grid-search over the parameter combinations on the training split.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Keep the best estimator found, then refit it on the training split.
clf = grid_obj.best_estimator_
clf = clf.fit(X_train, y_train)

# Accuracy on the held-out half of the training data.
test_predictions = clf.predict(X_test)
print("测试集准确率: %s " % accuracy_score(y_test, test_predictions))

# Accuracy on the external test file against the hand-written labels.
predictions = clf.predict(test_data)
print("最终准确率: %s " % accuracy_score(y_result, predictions))