import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
filename = 'data.csv'
raw = pd.read_csv(filename)
print(raw.shape)
raw.head()
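The next few cells are a short pandas groupby demo on a toy frame. The DataFrame `df` they use is never built in the cells shown; a minimal sketch that reproduces its shape (keys and columns read off the printed output below, values random):

import numpy as np
import pandas as pd

# Hypothetical setup for the groupby demo; keys/columns match the printed
# output below, data1/data2 are just random draws.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'c'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})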
ss = df.groupby('key1')
for g in ss:
    print(g[1])  # just the group's sub-DataFrame
for g in ss:
    print(g)     # the full (key, sub-DataFrame) tuple
  key1 key2     data1     data2
0    a  one  1.084443 -0.806561
1    a  two -0.191386  2.743989
  key1 key2     data1     data2
2    b  one  0.302269 -0.560596
3    b  two -0.213348  0.962007
  key1 key2     data1     data2
4    c  one -0.893631 -0.193283
('a',   key1 key2     data1     data2
0    a  one  1.084443 -0.806561
1    a  two -0.191386  2.743989)
('b',   key1 key2     data1     data2
2    b  one  0.302269 -0.560596
3    b  two -0.213348  0.962007)
('c',   key1 key2     data1     data2
4    c  one -0.893631 -0.193283)
df['data2'].groupby(df['key1']).sum()
key1
a   -2.167849
b   -1.954537
Name: data2, dtype: float64
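The same groupby extends to several columns and mixed aggregations; a quick sketch on the same toy frame:

# Sum both data columns per key in one call...
print(df.groupby('key1')[['data1', 'data2']].sum())
# ...or mix aggregations per column with agg().
print(df.groupby('key1').agg({'data1': 'mean', 'data2': 'sum'}))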
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))

def scatter_plot_by_category(feat):
    # one colour per category of `feat`, plotted over the court coordinates
    alpha = 0.1
    gs = raw.groupby(feat)
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for g, c in zip(gs, cs):
        plt.scatter(g[1].loc_x, g[1].loc_y, color=c, alpha=alpha)
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
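The model-selection cells below use train_kobe and train_label, which are never constructed in the cells shown. A plausible reconstruction, assuming the standard Kobe Bryant shot-selection dataset where shot_made_flag is the label (NaN for the rows to be predicted); the feature subset here is my own guess:

# Hypothetical reconstruction of train_kobe / train_label: keep the labeled
# rows, pick a few numeric features, split off the label column.
labeled = raw[raw['shot_made_flag'].notnull()].copy()
feature_cols = ['loc_x', 'loc_y', 'minutes_remaining', 'seconds_remaining',
                'shot_distance', 'period']  # assumed feature subset
train_kobe = labeled[feature_cols]
train_label = labeled['shot_made_flag']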
import time
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()
    rfc_score = 0.  # mean fold log loss for this n
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict(train_kobe.iloc[test_k])
        # note: log_loss on hard 0/1 predictions; predict_proba would give a
        # smoother estimate (see the sketch after this search)
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    print(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)
Finding best n_estimators for RandomForestClassifier...
the number of trees: 1
13.919386979412682
Done processing 1 trees (0.726sec)
the number of trees: 12
13.389844271562911
Done processing 12 trees (6.008sec)
the number of trees: 23
13.180159610972076
Done processing 23 trees (11.043sec)
the number of trees: 34
13.219150771746676
Done processing 34 trees (18.443sec)
the number of trees: 45
13.043065195199537
Done processing 45 trees (22.152sec)
the number of trees: 56
13.057856481381911
Done processing 56 trees (27.636sec)
the number of trees: 67
13.055167436843206
Done processing 67 trees (32.039sec)
the number of trees: 78
13.051153915112355
Done processing 78 trees (38.372sec)
the number of trees: 89
13.048438999530031
Done processing 89 trees (44.605sec)
the number of trees: 100
13.114318256666222
Done processing 100 trees (49.886sec)
45 13.043065195199537
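As flagged in the loop above, feeding hard 0/1 predictions to log_loss gives a coarse (and large) loss; the usual approach is to score predicted probabilities. A sketch of the probability-based equivalent with cross_val_score, assuming the same train_kobe / train_label:

from sklearn.model_selection import cross_val_score

# 10-fold CV log loss on predicted probabilities; scikit-learn returns the
# negated loss because its scorers follow a greater-is-better convention.
rfc = RandomForestClassifier(n_estimators=best_n)
neg_ll = cross_val_score(rfc, train_kobe, train_label, cv=10,
                         scoring='neg_log_loss')
print('mean CV log loss: {0:.4f}'.format(-neg_ll.mean()))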
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.  # mean fold log loss for this depth
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing max_depth = {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing max_depth = 1 (2.389sec)
the max depth : 10
Done processing max_depth = 10 (6.044sec)
the max depth : 100
Done processing max_depth = 100 (21.773sec)
10 11.06186210729279
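The two manual sweeps above tune n_estimators and max_depth one at a time; GridSearchCV performs the same cross-validated search jointly in a few lines. A sketch over the same grids:

from sklearn.model_selection import GridSearchCV

# Joint 10-fold CV sweep over both hyperparameters, selected by log loss.
param_grid = {'n_estimators': np.linspace(1, 100, num=10).astype(int),
              'max_depth': np.logspace(0, 2, num=3).astype(int)}
grid = GridSearchCV(RandomForestClassifier(), param_grid,
                    cv=10, scoring='neg_log_loss')
grid.fit(train_kobe, train_label)
print(grid.best_params_, -grid.best_score_)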
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
pred = model.predict(train_kobe)
count = 0
for i, j in zip(train_label, pred):  # accuracy on the training set itself (optimistic)
    if i == j:
        count += 1
print("acc is {0}".format(count / len(train_label)))
acc is 0.6850215978518893
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_kobe, train_label, test_size=0.3, random_state=0)
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(X_train, y_train)
pre = model.predict(X_test)
count = 0
for i, j in zip(y_test, pre):  # compare against the held-out predictions, not the training ones
    if i == j:
        count += 1
print("acc is {0}".format(count / len(pre)))