Python数据挖掘入门与实践——用决策树预测获胜球队

参考书籍:《Python数据挖掘入门与实践》

1.加载数据集:

使用 pandas 加载数据集:共有 1319 行数据、8 个特征。查看前 5 行数据,并检查是否存在重复数据。

#coding=gbk

# Predict the winning team with a decision tree.

import time

# time.clock() was removed in Python 3.8.  Use it only on interpreters that
# still provide it (so this reading stays on the same clock as the reading
# taken at the end of the script) and fall back to time.perf_counter()
# everywhere else.
_timer = getattr(time, 'clock', None) or time.perf_counter
start = _timer()

# Load the dataset.

import pandas as pd

file_name = r'D:\datasets\NBA_2014_games.csv'

data = pd.read_csv(file_name)

print(data.head())  # show the first 5 rows
# Date Unnamed: 1 Visitor/Neutral PTS Home/Neutral \.....
# 0 Tue Oct 29 2013 Box Score Orlando Magic 87 Indiana Pacers
# 1 Tue Oct 29 2013 Box Score Los Angeles Clippers 103 Los Angeles Lakers
# 2 Tue Oct 29 2013 Box Score Chicago Bulls 95 Miami Heat
# 3 Wed Oct 30 2013 Box Score Brooklyn Nets 94 Cleveland Cavaliers
# 4 Wed Oct 30 2013 Box Score Atlanta Hawks 109 Dallas Mavericks

print(data.shape)  # (1319, 8)

print(data[data.duplicated()])  # Empty DataFrame: there are no duplicated rows

数据集清洗:1.第一列数据日期是字符串格式,改为日期格式; 2.修改表头。

# Re-read the file so that the 'Date' column is parsed into real dates,
# then replace the header with descriptive column names.
data = pd.read_csv(file_name, parse_dates=['Date'])

data.columns = [
    'Date',
    'Score Type',
    'Visitor Team',
    'VisitorPts',
    'Home Team',
    'HomePts',
    'OT?',
    'Notes',
]

print(data.head())  # first 5 rows, now with the renamed header
# Date Score Type Visitor Team VisitorPts \....
# 0 2013-10-29 Box Score Orlando Magic 87
# 1 2013-10-29 Box Score Los Angeles Clippers 103
# 2 2013-10-29 Box Score Chicago Bulls 95
# 3 2013-10-30 Box Score Brooklyn Nets 94
# 4 2013-10-30 Box Score Atlanta Hawks 109

print('-----')

# print(data.loc[1])  # would print the 2nd row

提取新特征:通过现有的数据抽取特征。首先确定类别:篮球比赛只有胜负之分,不像足球还有平局。以 True(1)代表主队获胜,False(0)代表主队失败。

# Extract new features.

# Class label: True when the home team won the game.
data['HomeWin'] = data['VisitorPts'] < data['HomePts']

y_true = data['HomeWin'].values

print(y_true[:5])  # [ True True True True True] -- a numpy array

# print(data.head())

# Two new features: whether each of the two teams won its previous game.
# won_last maps a team name to the result of that team's most recent game;
# a team that has not played yet gets the defaultdict(int) default, 0.
from collections import defaultdict

won_last = defaultdict(int)

# Build each feature as a plain list and assign the column once.  The
# previous version wrote every row back with ``data.ix[index] = row``, but
# ``.ix`` was removed in pandas 1.0, and rewriting whole rows while
# iterating with iterrows() is slow.
home_last_win = []
visitor_last_win = []

for home_team, visitor_team, home_win in zip(
        data['Home Team'], data['Visitor Team'], data['HomeWin']):
    # Read the stored results *before* recording this game, so the
    # feature only ever describes the previous game.
    home_last_win.append(bool(won_last[home_team]))
    visitor_last_win.append(bool(won_last[visitor_team]))

    # Remember this game's outcome for each team's next game.
    won_last[home_team] = bool(home_win)
    won_last[visitor_team] = not home_win

data['HomeLastWin'] = home_last_win
data['VisitorLastWin'] = visitor_last_win

print('----')

# print(data.loc[20:25])
# Home Team HomePts OT? Notes HomeWin HomeLastWin VisitorLastWin
# 20 Boston Celtics 98 NaN NaN False False False
# 21 Brooklyn Nets 101 NaN NaN True False False
# 22 Charlotte Bobcats 90 NaN NaN True False True
# 23 Denver Nuggets 98 NaN NaN False False False
# 24 Houston Rockets 113 NaN NaN True True True
# 25 Los Angeles Lakers 85 NaN NaN False False True

一些练习测试代码:defaultdict 和 iterrows()的使用方法

# Scratch examples: how defaultdict and DataFrame.iterrows() behave.

won_last['jj'] = 12

# Looking up a missing key in a defaultdict does not raise KeyError; the key
# is created with the default value instead.
dd = won_last['Indiana Pacers']

print(dd)  # 0 when the key was missing from a fresh defaultdict(int)

print(won_last)  # prints the defaultdict itself, with its stored keys and values

dataset = pd.DataFrame([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

print(dataset)

for row_label, row_values in dataset.iterrows():
    print(row_label)   # 0, 1, 2 -- the row labels
    print(row_values)  # every element of rows 1, 2 and 3 in turn

2.使用决策树

这里直接使用决策树, 没有刻意地去调参数,可能是作者为了对比不同特征的优劣吧。

从数据集中构建有效的特征(Feature Engineering,特征工程)是数据挖掘的难点所在。好的特征直接关系到结果的正确率,甚至比选择合适的算法更重要。

# Baseline: a decision tree trained on the two "won last game" features.

import numpy as np
from sklearn.model_selection import cross_val_score  # cross-validated scoring
from sklearn.tree import DecisionTreeClassifier

# A fixed random seed is meant to make the run repeatable (the author notes
# the results still differed from the book's).
clf = DecisionTreeClassifier(random_state=14)

# Use the two newly created features as the model input.
X_previousWins = data[['HomeLastWin', 'VisitorLastWin']].values

scores = cross_val_score(clf, X_previousWins, y_true, scoring='accuracy')

# Average the per-fold accuracies and express the result as a percentage.
mean_score = scores.mean() * 100

print('the accuracy is %0.2f'%mean_score+'%')  # the accuracy is 57.47%

使用另一个数据集:2013 年 NBA 球队排名情况

# Load the 2013 team standings.

# NOTE(review): 'stangdings' looks like a misspelling of 'standings' --
# confirm against the actual file name on disk before changing it.
file_name2 = r'D:\datasets\NBA_2013_stangdings.csv'

standings = pd.read_csv(file_name2)

# print(standings.head())

# Rk Team Overall Home Road E W A C \....

# 0 1 Miami Heat 66-16 37-4 29-12 41-11 25-5 14-4 12-6

# 1 2 Oklahoma City Thunder 60-22 34-7 26-15 21-9 39-13 7-3 8-2

# 2 3 San Antonio Spurs 58-24 35-6 23-18 25-5 33-19 8-2 9-1

# 3 4 Denver Nuggets 57-25 38-3 19-22 19-11 38-14 5-5 10-0

# 4 5 Los Angeles Clippers 56-26 32-9 24-17 21-9 35-17 7-3 8-2

# print(standings.shape) # (30, 24): there are 30 teams

创建一个新特征值, 主场球队是否比对手排名高。然后使用创建的3个特征去 fit 模型

# New feature: does the home team rank higher than the visiting team?

# Look every rank up in a plain dict instead of filtering the standings
# DataFrame twice per game.
team_rank = dict(zip(standings['Team'], standings['Rk']))

# The games file uses the team's new name, while the 2013 standings still
# list it under the old one.
renamed_teams = {'New Orleans Pelicans': 'New Orleans Hornets'}

home_names = data['Home Team'].replace(renamed_teams)
visitor_names = data['Visitor Team'].replace(renamed_teams)

# Rk counts upwards from 1 for the best team, so "ranks higher" means a
# *smaller* Rk.  The previous code compared with ``>``, which stored the
# opposite of what the column name says.  The column is also assigned in
# one go now, because ``data.ix[index] = row`` no longer exists in
# pandas >= 1.0.
data['HomeTeamRanksHigher'] = [
    int(team_rank[home] < team_rank[visitor])
    for home, visitor in zip(home_names, visitor_names)
]

X_homehigher = data[['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']].values

# clf1 = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf1, X_homehigher, y_true, scoring='accuracy')
# mean_score1 = np.mean(scores) *100
# print('the new accuracy is %.2f'%mean_score1 + '%') #the new accuracy is 59.67%

再创建一个新特征:这两支球队上一次交手时,本场的主队是否获胜。

# New feature: did today's home team win the last time these two teams met?

# Maps an alphabetically sorted (team, team) pair to the name of the team
# that won their most recent meeting.  A pair that has not met yet yields
# the defaultdict(int) default, 0, which never equals a team name.
last_match_winner = defaultdict(int)

# Build the column as a list and assign it once; the earlier
# ``data.ix[index] = row`` write-back relies on ``.ix``, which was removed
# in pandas 1.0.
home_team_won_last = []

for home_team, visitor_team, home_win in zip(
        data['Home Team'], data['Visitor Team'], data['HomeWin']):
    # Sort the pair so the key is the same whichever side is at home.
    teams = tuple(sorted([home_team, visitor_team]))

    home_team_won_last.append(int(last_match_winner[teams] == home_team))

    # Record this game's winner for the pair's next meeting.
    last_match_winner[teams] = home_team if home_win else visitor_team

data['HomeTeamWonLast'] = home_team_won_last

X_lastwinner = data[['HomeTeamWonLast', 'HomeTeamRanksHigher']]

# clf2 = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf2, X_lastwinner, y_true, scoring='accuracy')
# mean_score2 = np.mean(scores) *100
# print('the accuracy is %.2f'%mean_score2 + '%') # the accuracy is 57.85%

观察决策树在训练数据量很大的情况下能否得到有效的模型:将球队名称作为特征,并对其进行编码。

# Turn the team names (strings) into integers with LabelEncoder, then
# one-hot encode those integers.

from sklearn.preprocessing import LabelEncoder

encoding = LabelEncoder()

# Fit on both columns so that transform() cannot fail on a team that only
# ever appears as a visitor.  When both columns contain the same set of
# teams the resulting codes are the same as fitting on 'Home Team' alone.
encoding.fit(data[['Home Team', 'Visitor Team']].values.ravel())

home_teams = encoding.transform(data['Home Team'].values)

visitor_teams = encoding.transform(data['Visitor Team'].values)

# One row per game: [home team code, visitor team code].
X_teams = np.vstack([home_teams, visitor_teams]).T

from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()

# toarray() returns a regular ndarray.  todense() returns np.matrix, which
# recent scikit-learn releases refuse as estimator input.
X_teams_expanded = onehot.fit_transform(X_teams).toarray()

clf3 = DecisionTreeClassifier(random_state=14)

# scores = cross_val_score(clf3, X_teams_expanded, y_true, scoring='accuracy')
# mean_score3 = np.mean(scores) *100
# print('the accuracy is %.2f'%mean_score3+'%') # the accuracy is 59.52%

3.使用随机森林

print('----rf-----')

# Predict with a random forest.

from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier(random_state = 14, n_jobs =-1) # the tree parameters are worth tuning
# rf_scores = cross_val_score(rf, X_teams, y_true, scoring='accuracy')
# mean_rf_score = np.mean(rf_scores) *100
# print('the randforestclassifier accuracy is %.2f'%mean_rf_score+'%') #the randforestclassifier accuracy is 58.38%

# Use more features at once.

print('使用多个参数')

# Put the hand-made features and the integer team codes side by side,
# column-wise.
X_all = np.concatenate([X_homehigher, X_teams], axis=1)

# rf_clf2 = RandomForestClassifier(random_state = 14, n_jobs=-1)
# rf_scores2 = cross_val_score(rf_clf2, X_all, y_true, scoring='accuracy')
# mean_rf_score2 = np.mean(rf_scores2) *100
# print('the accuracy is %.2f'%mean_rf_score2+'%') # the accuracy is 57.62%

使用网格搜索查找最佳的模型, 并查看使用的参数。

# Tune the random forest's hyper-parameters with a grid search.

from sklearn.model_selection import GridSearchCV

param_grid = {
    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' spelling was
    # deprecated and later removed from scikit-learn.
    'max_features': [2, 3, 'sqrt'],
    'n_estimators': [100, 110, 120],
    'criterion': ['gini', 'entropy'],
    "min_samples_leaf": [2, 4, 6],
}

clf = RandomForestClassifier(random_state=14, n_jobs=-1)

grid = GridSearchCV(clf, param_grid)

grid.fit(X_all, y_true)

score = grid.best_score_ * 100

print('the accuracy is %.2f'%score +'%')  # the accuracy is 62.02%

something = str(grid.best_estimator_)

print(something)  # the best model found by the grid search

print(grid.best_params_)  # the parameter combination that scored best

# Output recorded by the author:
# the accuracy is 62.02%
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
# max_depth=None, max_features=3, max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None,
# min_samples_leaf=2, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
# oob_score=False, random_state=14, verbose=0, warm_start=False)
# {'n_estimators': 100, 'criterion': 'entropy', 'max_features': 3, 'min_samples_leaf': 2}
# 所花费的时间 : 117.93s   (elapsed time)

# time.clock() was removed in Python 3.8.  Use it only where it still
# exists (so this reading is on the same clock as ``start``) and fall back
# to time.perf_counter() otherwise.
_timer = getattr(time, 'clock', None) or time.perf_counter
end = _timer()

# Keep the result in its own name: the old ``time = end - start`` replaced
# the ``time`` module with a float.
elapsed = end - start

print('所花费的时间 : %.2f' % elapsed + 's')

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值