import numpy as np
import joblib
import pandas as pd # data-analysis library
import os,glob # os: basic system control (working directory, file access); glob: wildcard-pattern file lookup
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings # used to silence warning messages
from pyecharts.charts import Bar,Grid
from pyecharts import options as opts
warnings.filterwarnings("ignore") # suppress every warning for the whole process
# Figure options
plt.rcParams["figure.figsize"] = (10,6) # default figure size
plt.rcParams["font.sans-serif"] = ["SimHei"] # use the SimHei font so Chinese text renders
plt.rcParams["axes.unicode_minus"] = False # draw minus signs as ASCII hyphens instead of U+2212 (presumably because the CJK font lacks that glyph)
def _finalist_row(season_df, team, label, filename):
    """Return a one-row copy of *team*'s row with the ``冠军`` column set to *label*."""
    mask = season_df['球队'] == team
    if not mask.any():
        raise IndexError(f"team {team!r} not found in {filename!r}")
    # Work on a copy and assign the whole column: this avoids the chained
    # ``df['冠军'].iloc[i] = ...`` form, which does not write through to the
    # frame when pandas Copy-on-Write is active.
    row = season_df[mask].head(1).copy()
    row['冠军'] = label
    return row


def get_data(path):
    """Build the training table from past NBA Finals teams.

    Changes the working directory to *path* and reads every
    ``team_data*.csv`` file found there.  The i-th file (in sorted filename
    order) is paired with the i-th entry of the hard-coded champion and
    runner-up lists below; the matching team rows are extracted and labelled
    in a ``冠军`` column (1 = champion, 0 = runner-up).

    Args:
        path: Directory containing the ``team_data*.csv`` files.  It becomes
            the process working directory as a side effect.

    Returns:
        A DataFrame holding all champion rows first, then all runner-up rows
        (each group in file order), re-indexed from 0.  The table is also
        printed.

    Raises:
        ValueError: If no ``team_data*.csv`` file is found.
        IndexError: If there are more files than listed finals, or a listed
            team is missing from its file.
    """
    team_win = ['达拉斯 小牛', '迈阿密 热火',
                '金州 勇士', '克利夫兰 骑士',
                '迈阿密 热火', '圣安东尼奥 马刺',
                '金州 勇士', '金州 勇士',
                '多伦多 猛龙', '洛杉矶 湖人', '密尔沃基 雄鹿']
    team_los = ['迈阿密 热火', '奥克拉荷马城 雷霆',
                '圣安东尼奥 马刺', '迈阿密 热火',
                '克利夫兰 骑士', '金州 勇士',
                '克利夫兰 骑士', '克利夫兰 骑士',
                '金州 勇士', '迈阿密 热火', '菲尼克斯 太阳']

    os.chdir(path)
    # glob makes no ordering promise; sort so files pair up with the lists
    # above in a reproducible way.
    filenames = sorted(glob.glob("team_data*.csv"))
    if not filenames:
        raise ValueError(f"no team_data*.csv files found in {path!r}")
    if len(filenames) > len(team_win):
        raise IndexError(
            f"{len(filenames)} season files but only {len(team_win)} finals listed"
        )

    winner_rows = []
    loser_rows = []
    for season, filename in enumerate(filenames):
        # Each file is read once and serves both the champion and runner-up lookup.
        season_df = pd.read_csv(filename)
        winner_rows.append(_finalist_row(season_df, team_win[season], 1, filename))
        loser_rows.append(_finalist_row(season_df, team_los[season], 0, filename))

    df = pd.concat(winner_rows + loser_rows, axis=0, ignore_index=True)
    print(df)
    return df
def pca_data(data, label_col='冠军'):
    """Reduce the feature columns of *data* with PCA.

    The first column is always discarded, and the target column is removed
    as well so the label never leaks into the features the classifier is
    trained on.  A fresh PCA is fitted on every call, so arrays returned by
    separate calls are expressed in different component bases.

    Args:
        data: DataFrame whose first column is dropped before fitting
            (assumed to be a non-numeric identifier such as the team name —
            TODO confirm against the CSV layout).
        label_col: Name of the target column to exclude from the features.
            Tables that do not contain it are accepted unchanged.  Pass
            ``None`` to keep every remaining column.

    Returns:
        The array returned by ``PCA(n_components='mle').fit_transform`` on
        the remaining columns.
    """
    from sklearn.decomposition import PCA

    features = data.drop(data.columns[0], axis=1)
    if label_col is not None:
        # errors='ignore' keeps unlabeled tables (e.g. for clustering) working.
        features = features.drop(columns=[label_col], errors='ignore')
    # 'mle' lets PCA choose the number of components automatically.
    pca = PCA(n_components='mle')
    return pca.fit_transform(features)
def fit_data(reduced_X, data):
    """Train the classifiers, report training accuracy and save the forest.

    Fits a hard-voting ensemble (logistic regression, random forest,
    Gaussian naive Bayes) and then each of the three estimators on its own,
    prints the training-set accuracy of all four, and dumps the standalone
    random forest to ``model.pickle`` in the current working directory.

    Args:
        reduced_X: Feature matrix, one row per row of *data*.
        data: DataFrame providing the target in its ``冠军`` column.
    """
    from sklearn import metrics  # model evaluation
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    features = np.array(reduced_X)
    targets = np.array(data['冠军'])

    logistic = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    voter = VotingClassifier(
        estimators=[('lr', logistic), ('rf', forest), ('gnb', bayes)],
        voting='hard',
    )

    # Same fitting order as before: the ensemble first, then each estimator alone.
    for model in (voter, logistic, forest, bayes):
        model.fit(features, targets)

    reports = (
        ("LR - Accuracy (Train): %.4g", logistic),
        ("RF - Accuracy (Train): %.4g", forest),
        ("GNB - Accuracy (Train): %.4g", bayes),
        ("ensemble - Accuracy (Train): %.4g", voter),
    )
    for template, model in reports:
        accuracy = metrics.accuracy_score(targets, model.predict(features))
        print(template % accuracy)

    # Only the standalone random forest is persisted.
    joblib.dump(forest, 'model.pickle')
def pred_data(path):
    """Predict this season's champion candidates and plot the tally.

    Reads ``本赛季.csv`` (located by appending the file name to *path*),
    reduces it with ``pca_data``, loads the random forest saved in
    ``model.pickle`` from the current working directory and counts, per
    team, how often the model labels it as champion over 100 rounds.  The
    tally is printed and, when non-empty, drawn as a bar chart.

    Args:
        path: Directory prefix of ``本赛季.csv``; it is concatenated as-is,
            so it must end with a path separator.
    """
    df = pd.read_csv(path + '本赛季.csv')
    df['冠军'] = 0
    # NOTE(review): pca_data fits a new PCA on this table, so these components
    # are not the ones the stored model was trained on — confirm this is intended.
    x_test = pca_data(df)
    # joblib.load unpickles the file; only load model files you produced yourself.
    RF = joblib.load('model.pickle')

    rounds = 100
    # A fitted model predicts the same labels on every call, so predict once
    # and scale the tally instead of repeating the identical call 100 times.
    is_champion = np.asarray(RF.predict(x_test)) == 1
    tally = Counter()
    for team in df.loc[is_champion, '球队']:
        tally[team] += rounds

    df_new = pd.DataFrame({0: list(tally.keys()), 1: list(tally.values())})
    print(df_new)
    if df_new.empty:
        # No team was predicted as champion: there is nothing to plot.
        return

    sns.barplot(x=df_new[0], y=df_new[1], data=df_new)
    plt.xticks(rotation=90)  # rotate the x tick labels by 90 degrees
    plt.title('夺冠可能性')
    plt.show()
def player_data(path):
    """Rank players by a per-game contribution score and chart the top ten.

    Changes the working directory to *path*, reads the first
    ``player*.csv`` file (sorted by name) and then:

    1. scores every player as (points + rebounds + assists + steals +
       blocks - turnovers - fouls) / games and shows a bar chart of the
       ten highest scores (also printed);
    2. divides every statistic column by its column total and draws one
       radar chart per top player over six of those shares;
    3. renders a pyecharts bar chart comparing the same six shares across
       the top players (written by ``Bar.render`` to its default output);
    4. shows a scatter plot of the playing-time share against the turnover share.

    Args:
        path: Directory containing the ``player*.csv`` file.  It becomes the
            process working directory as a side effect.

    Raises:
        IndexError: If no ``player*.csv`` file exists in *path*.
    """
    os.chdir(path)
    filenames = sorted(glob.glob("player*.csv"))
    df = pd.read_csv(filenames[0])

    contribution = (
        df['得分'] + df['篮板'] + df['助攻'] + df['抢断'] + df['盖帽']
        - df['失误'] - df['犯规']
    ) / df['场次']
    # Column 0 holds the player name, column 1 the contribution score.
    df_new = pd.concat([df['球员'], contribution], axis=1, ignore_index=True)
    df_new = df_new.sort_values(by=1, axis=0, ascending=False).head(10)

    sns.barplot(x=df_new[0], y=df_new[1], data=df_new, palette='husl')
    plt.xticks(rotation=90)
    plt.title('个人贡献值前十球员')
    plt.show()
    print(df_new)

    # Express every statistic as the player's share of the column total.
    stats = df.drop('球员', axis=1)
    shares = stats / stats.sum()
    df = pd.concat([df['球员'], shares], axis=1, ignore_index=False)

    axes_names = ['得分', '防守效率', '进攻效率', '犯规', '出场时间', '失误']
    # Repeat the first point at the end so each radar polygon closes.
    angles = np.linspace(0, 2 * np.pi, len(axes_names), endpoint=False)
    closed_angles = np.concatenate((angles, [angles[0]]))
    closed_labels = axes_names + axes_names[:1]

    per_player = []  # (player name, six share values) in ranking order
    for row_label in df_new.index:
        player = df.loc[row_label]
        # Use the bare name rather than str() of a one-element Series,
        # which would put the Series repr into the title.
        name = str(player['球员'])
        values = [float(v) for v in player[axes_names]]
        per_player.append((name, values))

        closed_values = values + values[:1]
        plt.polar(closed_angles, closed_values, 'bo-', linewidth=1)
        plt.thetagrids(closed_angles * 180 / np.pi, closed_labels)
        plt.fill(closed_angles, closed_values, facecolor='b', alpha=0.25)
        plt.title(name)
        plt.show()

    # One series per ranked player, however many there are (the file may
    # hold fewer than ten players).
    bar = Bar().add_xaxis(axes_names)
    for name, values in per_player:
        bar.add_yaxis(name, values)
    bar.set_global_opts(datazoom_opts=opts.DataZoomOpts())
    bar.render()

    sns.scatterplot(x="出场时间", y="失误", data=df)
    # Without show() this last figure is never displayed when run as a script.
    plt.show()
def Kmean(data):
    """Cluster the rows of *data* into three groups.

    The table is first reduced with ``pca_data``; a 3-cluster KMeans
    (``random_state=1``) is fitted on the result and the Euclidean
    silhouette score of the clustering is printed.

    Args:
        data: DataFrame accepted by ``pca_data``.

    Returns:
        The cluster label assigned to each row.
    """
    from sklearn import metrics
    from sklearn.cluster import KMeans

    reduced = pca_data(data)
    model = KMeans(n_clusters=3, random_state=1)
    model.fit(reduced)
    assignments = model.labels_
    score = metrics.silhouette_score(reduced, assignments, metric='euclidean')
    print(score)
    return assignments
def team_data(path):
    """Group this season's teams into three tiers and show a pie chart.

    Reads ``本赛季.csv`` (located by appending the file name to *path*),
    clusters the teams with ``Kmean``, prints the team/tier table and draws
    a pie chart of the tier sizes with the member teams listed beside it.

    Cluster ids 0, 1, 2 are presented as tiers 1, 2, 3 both on the pie
    wedges and in the text.
    NOTE(review): the tier number is just the cluster id plus one; nothing
    here ranks the clusters by strength — confirm before reading it as a ranking.

    Args:
        path: Directory prefix of ``本赛季.csv``; it is concatenated as-is,
            so it must end with a path separator.
    """
    df = pd.read_csv(path + '本赛季.csv')
    labels = pd.DataFrame(Kmean(df)).rename(columns={0: '球队等级'})
    df_new = pd.concat([df['球队'], labels], axis=1, ignore_index=False)
    print(df_new)

    size = df_new.groupby(df_new['球队等级']).球队.count()
    fig = plt.figure(figsize=(10, 4))
    ax1 = fig.add_subplot(121)
    # Wedges are labelled 1..3 so they agree with the tier text drawn below.
    ax1.pie(size, labels=size.index + 1, autopct='%.2f%%', startangle=-110,
            colors=sns.color_palette("rainbow", size.shape[0]))

    # One text line per tier, at the original anchor points (y = 3, 2, 1).
    for cluster_id, y_pos in zip((0, 1, 2), (3, 2, 1)):
        members = df_new.loc[df_new['球队等级'] == cluster_id, '球队']
        # Join into one string; passing a list would display its Python repr.
        line = ' '.join([f'球队等级={cluster_id + 1}:', *map(str, members)])
        ax1.text(x=1, y=y_pos, s=line)

    # The other plotting functions call show(); without it this figure is
    # never displayed when the file runs as a script.
    plt.show()
if __name__ == "__main__":
    path = 'D:/天下3/job/nba_data/'

    # Training: collect the finalists of past seasons, reduce the features
    # and fit/save the model.
    data = get_data(path)
    reduced_X = pca_data(data)
    fit_data(reduced_X, data)

    # Reporting, in order: championship prediction chart, top-10 player
    # contribution charts, team strength tiers.
    for stage in (pred_data, player_data, team_data):
        stage(path)
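# Predict each NBA team's chance of winning this season's championship and run related analyses.
# (Scraped page footer: "latest recommended article published 2025-04-23 20:11:06".)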