"""Multi-output classification of water-conservancy tech-problem records.

Runs three experiments, each fitting a RandomForest wrapped in a
MultiOutputClassifier (one forest per label column) and printing the
exact-match accuracy on the held-out test set:

1. big + small categories together (all label columns from column 28 on),
2. big categories only (label columns 28..31 of the same files),
3. small categories only (separate train/test files, label columns 28..31).
"""
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
from pandas import read_csv
import pandas as pd

# Hard-coded data locations -- TODO(review): make these configurable.
root1 = "F:/goverment/shuili2/techproblem_text_train.csv"
root2 = "F:/goverment/shuili2/techproblem_text_test.csv"
root3 = "F:/goverment/shuili2/text_train_4problem.csv"
root4 = "F:/goverment/shuili2/text_test_4problem.csv"

N_FEATURES = 28  # columns [0, 28) are features; the remaining columns are labels


def _load_xy(path, label_stop=None):
    """Read a CSV and split it into (X, Y) numpy arrays.

    Columns [0, N_FEATURES) are the feature matrix X; columns
    [N_FEATURES, label_stop) are the label matrix Y (label_stop=None keeps
    every remaining column).
    """
    data = read_csv(path).values
    return data[:, :N_FEATURES], data[:, N_FEATURES:label_stop]


def _exact_match_accuracy(pred, truth):
    """Fraction of rows where EVERY predicted label equals the truth.

    Fixes the original code, which hard-coded the denominator as 1328
    (the size of one particular test set) and compared each of the label
    columns by hand; this version works for any test-set size and any
    number of label columns, and returns 0.0 for an empty test set
    instead of dividing by zero.
    """
    pred = np.asarray(pred)
    truth = np.asarray(truth)
    if len(truth) == 0:
        return 0.0
    return float(np.mean(np.all(pred == truth, axis=1)))


def _run_experiment(prefix, train_path, test_path, label_stop, n_estimators):
    """Fit a multi-output random forest and print results.

    The three print statements mirror the original script's output exactly
    (true training labels, predicted training labels, test accuracy);
    `prefix` carries the per-experiment label the original prepended.
    """
    X_train, Y_train = _load_xy(train_path, label_stop)
    X_test, Y_test = _load_xy(test_path, label_stop)
    print(prefix + '多输出多分类器真实输出分类:\n', Y_train)
    # random_state=1 keeps each run reproducible, as in the original.
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=1)
    model = MultiOutputClassifier(forest)  # one forest per label column
    y_pred = model.fit(X_train, Y_train).predict(X_train)
    print('多输出多分类器预测输出分类:\n', y_pred)
    print(_exact_match_accuracy(model.predict(X_test), Y_test))


'''大类小类一起预测'''
# All label columns, 500 trees (as in the original script).
_run_experiment('', root1, root2, label_stop=None, n_estimators=500)

'''只预测大类'''
# Big categories only: label columns 28..31 of the same files, 200 trees.
_run_experiment('只预测大类:', root1, root2, label_stop=32, n_estimators=200)

'''只预测小类'''
# Small categories only: different data files, label columns 28..31, 200 trees.
_run_experiment('只预测小类:', root3, root4, label_stop=32, n_estimators=200)

# NOTE(review): two large commented-out legacy drafts were removed here --
# a scikit-multilearn BinaryRelevance/GaussianNB experiment and a Keras MLP
# experiment. Recover them from version control if needed.
# --- Binary Relevance baseline (scikit-multilearn) ---------------------------
# Loads the same train/test CSVs and trains one GaussianNB per label column
# via the binary-relevance problem transformation, then reports test accuracy.
from pandas import read_csv

root1 = "F:/goverment/shuili2/techproblem_text_train.csv"
root2 = "F:/goverment/shuili2/techproblem_text_test.csv"
root3 = "F:/goverment/shuili2/text_train_4problem.csv"
root4 = "F:/goverment/shuili2/text_test_4problem.csv"

'''大类小类一起预测'''
# Convert the CSVs to numpy arrays; columns [0, 28) are features,
# every column from 28 on is a label.
dataset1 = read_csv(root1).values
dataset2 = read_csv(root2).values
X_train = dataset1[:, :28]
Y_train = dataset1[:, 28:]
X_test = dataset2[:, :28]
Y_test = dataset2[:, 28:]

from pprint import pprint
pprint(dataset1)  # debug dump of the raw training array

# Binary-relevance multi-label classifier with a Gaussian naive Bayes base
# classifier (one independent GaussianNB per label column).
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

classifier = BinaryRelevance(GaussianNB())

# Train.
classifier.fit(X_train, Y_train)

# Predict.
predictions = classifier.predict(X_test)

# Compute and report accuracy.
from sklearn.metrics import accuracy_score
# Bug fix: the original computed the score and silently discarded the
# result; print it so the baseline actually reports something.
print(accuracy_score(Y_test, predictions))