Predicting each NBA team's probability of winning the championship this season, with related analysis

import numpy as np
import joblib
import pandas as pd      # data analysis
import os, glob          # os for basic file-system operations (read/write), glob for wildcard-based file matching
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings          # used to suppress warning messages
from pyecharts.charts import Bar,Grid
from pyecharts import options as opts
warnings.filterwarnings("ignore")  # ignore all warnings

# Plotting options
plt.rcParams["figure.figsize"] = (10, 6)        # figure size
plt.rcParams["font.sans-serif"] = ["SimHei"]    # font that can render Chinese labels
plt.rcParams["axes.unicode_minus"] = False      # render minus signs correctly with the Chinese font
def get_data(path):
    """Build the training set from past Finals: one champion and one runner-up row per season."""
    os.chdir(path)
    filenames = glob.glob("team_data*.csv")
    # Champions and runners-up for each season file, listed in the same order as `filenames`
    team_win = ['达拉斯 小牛', '迈阿密 热火',
                '金州 勇士', '克利夫兰 骑士',
                '迈阿密 热火', '圣安东尼奥 马刺',
                '金州 勇士', '金州 勇士',
                '多伦多 猛龙', '洛杉矶 湖人', '密尔沃基 雄鹿']
    team_los = ['迈阿密 热火', '奥克拉荷马城 雷霆',
                '圣安东尼奥 马刺', '迈阿密 热火',
                '克利夫兰 骑士', '金州 勇士',
                '克利夫兰 骑士', '克利夫兰 骑士',
                '金州 勇士', '迈阿密 热火', '菲尼克斯 太阳']
    dfs = []
    for i, filename in enumerate(filenames):
        df = pd.read_csv(filename)
        df['冠军'] = 0
        index = df[df.球队 == team_win[i]].index.tolist()[0]  # row index of that season's champion
        df.loc[index, '冠军'] = 1                              # label the champion row with 1
        dfs.append(df[index:index + 1])
    for n, filename in enumerate(filenames):
        df = pd.read_csv(filename)
        df['冠军'] = 0
        index = df[df.球队 == team_los[n]].index.tolist()[0]  # row index of the runner-up (label stays 0)
        dfs.append(df[index:index + 1])
    df = pd.concat(dfs, axis=0, ignore_index=True)
    print(df)
    return df
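
# --- Optional sanity check (a minimal sketch, not called from __main__). ---
# The champion/runner-up lists in get_data() assume glob returns the season
# files in exactly the order the lists were written in. The hypothetical helper
# below only checks that every expected team name is present in its CSV, so a
# mis-ordered file list fails loudly instead of silently mislabelling rows.
def check_team_files(path, team_names):
    os.chdir(path)
    filenames = glob.glob("team_data*.csv")
    for filename, team in zip(filenames, team_names):
        if team not in pd.read_csv(filename)['球队'].tolist():
            print('WARNING: %s not found in %s' % (team, filename))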
    
def pca_data(data):
    from sklearn.decomposition import PCA
    x = data.drop(data.columns[0], axis=1)            # drop the team-name column
    x = x.drop('冠军', axis=1, errors='ignore')        # drop the label so it does not leak into the features
    pca = PCA(n_components='mle')                      # let PCA choose the number of components automatically
    reduced_X = pca.fit_transform(x)                   # note: the PCA is refit on every call (see the sketch below)
    return reduced_X
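
# --- Optional variant (a minimal sketch, not used by the pipeline above). ---
# Because pca_data() refits PCA on every call, n_components='mle' may pick a
# different number of components for the training data and for 本赛季.csv,
# which can break (or silently distort) the prediction step. One way to keep
# the two in the same feature space is to persist the PCA fitted on the
# training data and reuse its transform later; the file name 'pca.pickle' and
# the helper names are just illustrative.
def pca_fit_save(train_data):
    from sklearn.decomposition import PCA
    x = train_data.drop(train_data.columns[0], axis=1).drop('冠军', axis=1, errors='ignore')
    pca = PCA(n_components='mle').fit(x)
    joblib.dump(pca, 'pca.pickle')          # persist the fitted transform
    return pca.transform(x)

def pca_apply_saved(new_data):
    pca = joblib.load('pca.pickle')         # reuse the transform fitted on the training data
    x = new_data.drop(new_data.columns[0], axis=1).drop('冠军', axis=1, errors='ignore')
    return pca.transform(x)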
def fit_data(reduced_X, data):
    from sklearn import metrics                           # model evaluation
    from sklearn.linear_model import LogisticRegression   # logistic regression
    from sklearn.naive_bayes import GaussianNB             # Gaussian naive Bayes
    from sklearn.ensemble import RandomForestClassifier    # random forest
    from sklearn.ensemble import VotingClassifier          # voting ensemble

    x_train = np.array(reduced_X)
    y_train = np.array(data['冠军'])
    LR = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    RF = RandomForestClassifier(n_estimators=10)
    GNB = GaussianNB()

    ensemble = VotingClassifier(estimators=[('lr', LR), ('rf', RF), ('gnb', GNB)], voting='hard')
    ensemble.fit(x_train, y_train)

    LR.fit(x_train, y_train)
    RF.fit(x_train, y_train)
    GNB.fit(x_train, y_train)

    print("LR - Accuracy (Train):  %.4g" %
          metrics.accuracy_score(y_train, LR.predict(x_train)))
    print("RF - Accuracy (Train):  %.4g" %
          metrics.accuracy_score(y_train, RF.predict(x_train)))
    print("GNB - Accuracy (Train):  %.4g" %
          metrics.accuracy_score(y_train, GNB.predict(x_train)))
    print("ensemble - Accuracy (Train):  %.4g" %
          metrics.accuracy_score(y_train, ensemble.predict(x_train)))

    # Only the random forest is persisted; pred_data() reloads it for this season's prediction.
    joblib.dump(RF, 'model.pickle')
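
# --- Optional evaluation (a minimal sketch, not called from __main__). ---
# With only two Finals rows per season, the training accuracies above are very
# optimistic. Leave-one-out cross-validation on the same data gives a rougher
# but more honest estimate; the helper name is illustrative.
def loo_score(reduced_X, data):
    from sklearn.model_selection import cross_val_score, LeaveOneOut
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=10)
    scores = cross_val_score(rf, np.array(reduced_X), np.array(data['冠军']), cv=LeaveOneOut())
    print("RF - Accuracy (LOO CV): %.4g" % scores.mean())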
    
def pred_data(path):
    df = pd.read_csv(path + '本赛季.csv')
    df['冠军'] = 0
    x_test = pca_data(df)
    RF = joblib.load('model.pickle')
    dfs = []
    # Repeat the prediction and collect every team predicted as champion.
    # Note: a single fitted model is deterministic, so all 100 passes give the
    # same result; the counts only become a real "win rate" if the model is
    # re-trained or resampled on each pass (see the predict_proba sketch below).
    for _ in range(100):
        x = RF.predict(x_test)
        for index, label in enumerate(x):
            if label == 1:
                dfs.append(df[index:index + 1])
    pred_df = pd.concat(dfs, axis=0, ignore_index=True)
    a = Counter(pred_df['球队'])
    df_new = pd.DataFrame(list(a.items()))   # column 0: team, column 1: times predicted as champion

    print(df_new)
    sns.barplot(x=df_new[0], y=df_new[1], data=df_new)
    plt.xticks(rotation=90)                  # rotate the x-axis tick labels by 90 degrees
    plt.title('夺冠可能性')
    plt.show()
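
# --- Optional variant (a minimal sketch, not called from __main__). ---
# A single fitted model is deterministic, so the 100-pass loop in pred_data()
# repeats the same prediction. predict_proba() gives each team's estimated
# probability of the champion class directly, which can be ranked and plotted
# instead; the function and column names here are illustrative.
def pred_proba_data(path):
    df = pd.read_csv(path + '本赛季.csv')
    x_test = pca_data(df)
    RF = joblib.load('model.pickle')
    proba = RF.predict_proba(x_test)[:, 1]                 # probability of class 1 (champion)
    result = pd.DataFrame({'球队': df['球队'], '夺冠概率': proba})
    result = result.sort_values(by='夺冠概率', ascending=False)
    print(result)
    sns.barplot(x='球队', y='夺冠概率', data=result)
    plt.xticks(rotation=90)
    plt.title('夺冠可能性')
    plt.show()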
    
def player_data(path):
    os.chdir(path)
    filenames = glob.glob("player*.csv")
    df = pd.read_csv(filenames[0])
    # Per-game contribution score: (points + rebounds + assists + steals + blocks - turnovers - fouls) / games
    x = (df['得分'] + df['篮板'] + df['助攻'] + df['抢断'] + df['盖帽'] - df['失误'] - df['犯规']) / df['场次']
    df_new = pd.concat([df['球员'], x], axis=1, ignore_index=True)
    df_new = df_new.sort_values(by=1, axis=0, ascending=False)
    df_new = df_new.head(10)                      # top ten players by contribution score
    sns.barplot(x=df_new[0], y=df_new[1], data=df_new, palette='husl')
    plt.xticks(rotation=90)
    plt.title('个人贡献值前十球员')
    plt.show()
    print(df_new)

    # Normalise every numeric column by its column sum so the radar charts share a comparable scale.
    df1 = df.drop('球员', axis=1)
    df1 = df1 / df1.sum()
    df = pd.concat([df['球员'], df1], axis=1, ignore_index=False)
    dfs = []
    labels = np.array(['得分', '防守效率', '进攻效率', '犯规', '出场时间', '失误'])
    for i in range(df_new.shape[0]):
        x = df.loc[df_new.index[i]]
        name = x.loc['球员']
        y = x.loc[labels]
        dfs.append(y)
        # Radar (polar) chart of the six normalised indicators for this player.
        data = np.array(y, dtype=float)
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)
        closed_labels = np.concatenate((labels, [labels[0]]))   # close the polygon
        data = np.concatenate((data, [data[0]]))
        angles = np.concatenate((angles, [angles[0]]))
        plt.polar(angles, data, 'bo-', linewidth=1)
        plt.thetagrids(angles * 180 / np.pi, closed_labels)
        plt.fill(angles, data, facecolor='b', alpha=0.25)
        plt.title(str(name))
        plt.show()

    # Grouped pyecharts bar chart comparing the six indicators across the top-ten players.
    df1 = pd.concat(dfs, axis=1, ignore_index=True)
    bar = Bar().add_xaxis(['得分', '防守效率', '进攻效率', '犯规', '出场时间', '失误'])
    for k in range(df1.shape[1]):
        bar.add_yaxis(df_new[0].iloc[k], df1[k].astype(float).tolist())
    bar.set_global_opts(datazoom_opts=opts.DataZoomOpts())
    bar.render()

    # Relationship between minutes played and turnovers (normalised values).
    sns.scatterplot(x="出场时间", y="失误", data=df)
    plt.show()
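
# --- Optional follow-up (a minimal sketch, not called from __main__). ---
# The scatter plot above only shows the shape of the relationship between
# minutes played and turnovers; the Pearson correlation puts a number on it.
# The helper name is illustrative.
def minutes_turnover_corr(path):
    os.chdir(path)
    df = pd.read_csv(glob.glob("player*.csv")[0])
    print('correlation between 出场时间 and 失误: %.3f' % df['出场时间'].corr(df['失误']))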
def Kmean(data):
    from sklearn.cluster import KMeans
    from sklearn import metrics
    data = pca_data(data)                                        # reduce dimensionality before clustering
    kmeans_model = KMeans(n_clusters=3, random_state=1).fit(data)
    labels = kmeans_model.labels_
    score = metrics.silhouette_score(data, labels, metric='euclidean')
    print('silhouette score: %.3f' % score)

    return labels
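
# --- Optional helper (a minimal sketch, not called from __main__). ---
# Kmean() fixes n_clusters=3; comparing silhouette scores for a few values of k
# is a quick check on whether three tiers is a reasonable choice. The helper
# name and the candidate k values are illustrative.
def choose_k(data, k_values=(2, 3, 4, 5)):
    from sklearn.cluster import KMeans
    from sklearn import metrics
    reduced = pca_data(data)
    for k in k_values:
        labels = KMeans(n_clusters=k, random_state=1).fit(reduced).labels_
        print('k=%d, silhouette score=%.3f'
              % (k, metrics.silhouette_score(reduced, labels, metric='euclidean')))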


def team_data(path):
    df = pd.read_csv(path + '本赛季.csv')

    labels = Kmean(df)                       # cluster this season's teams into three tiers
    labels = pd.DataFrame(labels)
    labels = labels.rename(columns={0: '球队等级'})
    df_new = pd.concat([df['球队'], labels], axis=1, ignore_index=False)
    print(df_new)
    size = df_new.groupby(df_new['球队等级']).球队.count()
    fig = plt.figure(figsize=(10, 4))
    ax1 = fig.add_subplot(121)
    ax1.pie(size, labels=size.index, autopct='%.2f%%', startangle=-110,
            colors=sns.color_palette("rainbow", size.shape[0]))

    # List the teams that fall into each tier (the KMeans labels are 0, 1, 2).
    text_1 = ['球队等级=0:']
    text_2 = ['球队等级=1:']
    text_3 = ['球队等级=2:']
    for i in range(len(df_new)):
        if df_new.loc[i]['球队等级'] == 0:
            text_1.append(df_new.loc[i]['球队'])
        if df_new.loc[i]['球队等级'] == 1:
            text_2.append(df_new.loc[i]['球队'])
        if df_new.loc[i]['球队等级'] == 2:
            text_3.append(df_new.loc[i]['球队'])

    ax1.text(x=1, y=3, s=' '.join(text_1))
    ax1.text(x=1, y=2, s=' '.join(text_2))
    ax1.text(x=1, y=1, s=' '.join(text_3))
    plt.show()
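
# --- Optional follow-up (a minimal sketch, not called from __main__). ---
# Printing the tier membership via groupby is an alternative to writing the
# team names onto the pie axes; the helper name is illustrative.
def print_tiers(df_new):
    for level, group in df_new.groupby('球队等级'):
        print('球队等级=%d: %s' % (level, ', '.join(group['球队'])))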
    
    
if __name__ == "__main__":
    path = 'D:/天下3/job/nba_data/'
    data = get_data(path)          # Finals teams from the past seasons
    reduced_X = pca_data(data)     # dimensionality reduction
    fit_data(reduced_X, data)      # build the models
    pred_data(path)                # predict this season and visualise the result
    player_data(path)              # top ten players by individual contribution this season
    team_data(path)                # team strength analysis