# Kaggle Digit Recognizer — classical-ML baselines (KNN / LR / RF / SVM).
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")  # silence library deprecation chatter in console output
from sklearn.neighbors import KNeighborsClassifier
# Training CSV: column 0 is the digit label; the remaining 784 columns are
# presumably the 28x28 pixel values — confirm against the data source.
dataset = pd.read_csv(r"C:\Users\hjz\AI\project\02_lianxi\01_DigitRecognizer\0_data\train.csv")
print("1.训练集大小:",dataset.shape)
x_train = dataset.values[0:,1:]  # feature columns (everything after the label)
y_train = dataset.values[0:,0]   # label column
# Test CSV has no label column; used later for the Kaggle submissions.
x_test = pd.read_csv(r"C:\Users\hjz\AI\project\02_lianxi\01_DigitRecognizer\0_data\test.csv").values
# ---- Dimensionality reduction with PCA ----
from sklearn.decomposition import PCA

start = time.time()

# Fit a full PCA once to read the per-component explained-variance ratios,
# then keep the smallest number of components whose cumulative explained
# variance reaches 95%.
pca_model = PCA()
pca_model.fit(x_train)
information_list = pca_model.explained_variance_ratio_

score = 0.0
num = 0
for ratio in information_list:
    score += ratio
    num += 1
    if score >= 0.95:
        print("2.降维num:",num)
        break

# Re-fit with the chosen component count and project train and test
# through the SAME transform.
pca_model = PCA(num)
pca_model.fit(x_train)
x_train = pca_model.transform(x_train)
print("3.降维后训练集大小:",x_train.shape)
x_test = pca_model.transform(x_test)
print("4.降维后测试集大小:",x_test.shape)

end = time.time()
print("5.PCA耗时%.2f秒"%(end-start))

# Min-max scale to [0, 1].
# BUG FIX: the test set must be scaled with the *training* statistics;
# the original scaled it with its own min/max, so identical raw values
# mapped to different scaled values in train vs test.
train_min = x_train.min()
train_max = x_train.max()
x_train = (x_train - train_min) / (train_max - train_min)
x_test = (x_test - train_min) / (train_max - train_min)
# ---- KNN ----
print("开始进行KNN训练。。。")
knn_clf = KNeighborsClassifier(n_neighbors=4, algorithm='kd_tree', weights='distance')
start = time.time()
knn_clf.fit(x_train, y_train)
result = knn_clf.predict(x_test)
# Kaggle submission rows: 1-based image id plus the predicted digit.
result = np.c_[range(1, len(result) + 1), result.astype(int)]
# BUG FIX: the header must be 'ImageId' (as in the LR/RF/SVM sections and
# the Kaggle submission format), not 'ImageID'.
df_result = pd.DataFrame(result, columns=['ImageId', 'Label'])
# BUG FIX: '../results.knn.csv' was inconsistent with the other outputs
# ('results_lr.csv', 'results_RF.csv', 'results_SVM.csv').
df_result.to_csv('../results_knn.csv', index=False)
end = time.time()
print("6.KNN耗时%.2f秒"%(end-start))
# ---- Logistic Regression ----
print("开始进行LR训练。。。")
from sklearn.linear_model import LogisticRegression

start = time.time()
# BUG FIX: the default solver ('lbfgs' since scikit-learn 0.22) does not
# support penalty='l1' and raises at fit time; 'liblinear' supports L1 and
# matches the historical default this code was written against.
lr_clf = LogisticRegression(penalty='l1', C=0.2, solver='liblinear')
lr_clf.fit(x_train.astype("float"), y_train)
result = lr_clf.predict(x_test)
result = np.c_[range(1, len(result) + 1), result.astype(int)]
df_result = pd.DataFrame(result, columns=["ImageId", "Label"])
df_result.to_csv("../results_lr.csv", index=False)
end = time.time()
print("7.LR耗时%.2f秒"%(end-start))
from sklearn.ensemble import RandomForestClassifier

# ---- Random Forest ----
print("开始进行RF训练。。。")
start = time.time()
# 400-tree forest, gini criterion, 10 candidate features per split,
# trained on 4 worker processes with progress output (verbose=1).
rf_clf = RandomForestClassifier(
    n_estimators=400,
    n_jobs=4,
    verbose=1,
    criterion="gini",
    max_features=10,
)
rf_clf.fit(x_train.astype("float"), y_train)
predictions = rf_clf.predict(x_test).astype(int)
image_ids = np.arange(1, len(predictions) + 1)
result = np.c_[image_ids, predictions]
df_result = pd.DataFrame(result, columns=['ImageId', 'Label'])
df_result.to_csv('../results_RF.csv', index=False)
end = time.time()
print("8.RF耗时%.2f秒"%(end-start))
# ---- SVM ----
print("开始进行SVM训练。。。")
from sklearn.svm import SVC

start = time.time()
# RBF-kernel SVM: C=0.5, gamma=0.025, quiet training.
svc_clf = SVC(C=0.5, kernel='rbf', verbose=False, gamma=0.025)
svc_clf.fit(x_train.astype('float'), y_train)
labels = svc_clf.predict(x_test).astype(int)
ids = np.arange(1, len(labels) + 1)
result = np.c_[ids, labels]
df_result = pd.DataFrame(result, columns=['ImageId', 'Label'])
df_result.to_csv('../results_SVM.csv', index=False)
end = time.time()
print("9.SVM耗时%.2f秒"%(end-start))
C:\ProgramData\Anaconda2\envs\py36\python.exe C:/Users/hjz/AI/project/02_lianxi/01_DigitRecognizer/01_ML/DigitRecognizer_ML.py
1.训练集大小: (42000, 785)
2.降维num: 154
3.降维后训练集大小: (42000, 154)
4.降维后测试集大小: (28000, 154)
5.PCA耗时6.79秒
开始进行KNN训练。。。
6.KNN耗时256.17秒
开始进行LR训练。。。
7.LR耗时42.33秒
开始进行RF训练。。。
[Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 5.8s
[Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 25.6s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed: 53.3s finished
[Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 0.0s
[Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 0.5s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed: 1.2s finished
8.RF耗时55.12秒
开始进行SVM训练。。。
9.SVM耗时534.41秒
Process finished with exit code 0
# ---- XGBoost version of the digit recognizer ----
import xgboost as xgb
from sklearn.datasets import load_digits
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
def createDataSet():
    """Load the digit-recognizer CSVs and return numpy arrays.

    Returns:
        trainData:   feature matrix (all columns after the label column).
        trainLabels: 1-D array of labels taken from column 0 of train.csv.
        testData:    feature matrix from test.csv (no label column).
    """
    trainDataSet = np.array(pd.read_csv('../0_data/train.csv'))
    testDataSet = np.array(pd.read_csv('../0_data/test.csv'))
    # BUG FIX (latent): the original sliced columns with
    # trainDataSet[:, 1:len(trainDataSet)], bounding the COLUMN slice by
    # the ROW count; it only worked because rows > columns here. Take all
    # columns after the label explicitly.
    trainData = trainDataSet[:, 1:]
    trainLabels = trainDataSet[:, 0]
    testData = testDataSet
    return trainData, trainLabels, testData
def getPredict(datas, labels):
    """Train an XGBoost multi-class model with early stopping.

    Holds out 10% of the data for validation, trains up to 5000 boosting
    rounds (stopping 100 rounds after the validation metric stops
    improving), saves the model to '1.model', and prints the held-out
    accuracy.

    Args:
        datas:  2-D feature matrix.
        labels: 1-D class labels (digits 0-9, per 'num_class').
    """
    x_train, x_test, y_train, y_test = train_test_split(datas, labels, test_size=0.1)
    param = {
        'booster': 'gbtree',
        'objective': 'multi:softmax',  # predict() returns class labels directly
        'num_class': 10,
        'gamma': 0.1,
        'max_depth': 12,
        'lambda': 2,                   # L2 regularization weight
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'min_child_weight': 5,
        'silent': False,
        'learning_rate': 0.05,
        'seed': 1000
    }
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_val = xgb.DMatrix(data=x_test, label=y_test)
    xgb_test = xgb.DMatrix(x_test)  # validation features without labels, for predict()
    watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
    model = xgb.train(params=param,
                      dtrain=xgb_train,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100
                      )
    print('best best_ntree_limit:', model.best_ntree_limit)
    model.save_model('1.model')
    # BUG FIX: evaluate with only the best number of trees found by early
    # stopping; the original predicted with every tree, including the
    # post-best (overfit) rounds, so the printed accuracy did not reflect
    # the early-stopped model.
    y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
    print(accuracy_score(y_test, y_pred))
# ---- Train, then score the Kaggle test set and write the submission ----
trainData, trainLabels, testData = createDataSet()
getPredict(trainData, trainLabels)

xgbPredict = xgb.DMatrix(testData)
model = xgb.Booster()
model.load_model('1.model')  # reload the model saved by getPredict
y_pred = model.predict(xgbPredict)
print(y_pred)

# BUG FIX: use a context manager so the file is closed even if a write
# raises (the original used a bare open()/close() pair).
with open('submission_xgb.csv', 'w', encoding='utf-8') as f:
    f.write('ImageId,Label\n')
    # 1-based ImageId; softmax predictions come back as floats, cast to int.
    for i, pred in enumerate(y_pred, start=1):
        f.write(str(i) + ',' + str(int(pred)) + '\n')
| model   | 成绩    | 耗时     |
|---------|---------|----------|
| KNN     | 0.97282 | 258.80秒 |
| LR      | 0.88603 | 44.04秒  |
| RF-pca  | 0.94782 | 55.12秒  |
| SVM     | 0.88375 | 534.41秒 |
| XGBoost | 0.97282 | ~2000秒  |
| RF-only | 0.96796 | ~50秒    |