# 关于NBA所有数据的爬虫(rvest)

# 需要爬虫的原因是:网站一次性最多只能下载150行数据,如果手工下载,需要重复下载、复制5190次

# 准备工作
library("rvest")
library("xml2")

# Single-page trial ----
# Scrape page 0 of the query result into the data frame `data1`.
# The original used `--` as a comment marker, which is not R syntax and makes
# the script fail when run; comments here use `#`.
url <- "http://www.stat-nba.com/query.php?page=0&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30" # page address
web <- read_html(url, encoding = "UTF-8") # download and parse the page

# CSS class selector for every output column, listed in output column order.
# Suitable selectors can be found by trial and error or by reading the page's
# HTML.
stat_selectors <- c(
  player_name = ".player_name_out",
  season = ".season",
  is_win = ".wl",
  result = ".result_out",
  is_first = ".gs",
  playing_time = ".mp",
  field_goal_percent = ".fgper",
  field_goal = ".fg",
  field_goal_a = ".fga",
  three_percent = ".threepper",
  three = ".threep",
  three_a = ".threepa",
  free_percent = ".ftper",
  free = ".ft",
  free_a = ".fta",
  rebound = ".trb",
  rebound_offen = ".orb",
  rebound_defen = ".drb",
  assist = ".ast",
  steal = ".stl",
  block = ".blk",
  turnover = ".tov",
  foul = ".pf",
  points = ".pts"
)
# Columns kept as text; every other column goes through as.numeric().
text_columns <- c("player_name", "season", "is_win", "result")
# Columns whose text carries a "%" sign.
percent_columns <- c("field_goal_percent", "three_percent", "free_percent")

# Extract one column from a parsed page.
#   column: a name from `stat_selectors`
#   page:   the document returned by read_html()
# Returns a vector of length 150 (matches 2..151 of the selector).
extract_column <- function(column, page) {
  # Match 1 is skipped -- presumably the table header; confirm against the
  # page's HTML.
  raw <- html_text(html_nodes(page, stat_selectors[[column]]))[2:151]
  if (column %in% text_columns) {
    return(raw)
  }
  if (column %in% percent_columns) {
    # Text with a "%" sign cannot be converted by as.numeric() directly:
    # strip the sign first, then rescale to a fraction.
    return(as.numeric(sub("%", "", raw)) / 100)
  }
  as.numeric(raw)
}

columns <- lapply(names(stat_selectors), extract_column, page = web)
names(columns) <- names(stat_selectors)
data1 <- do.call(data.frame, columns) # combine the columns into one data frame

# Crawl all pages in a loop ----
# Fetch every result page and combine the rows into `player_data`.
# Each page is stored in a preallocated list and bound once at the end,
# instead of growing the data frame with rbind() on every iteration.
# If a fetch fails part-way, the pages fetched so far remain in `page_frames`.

# CSS class selector for every output column, listed in output column order.
stat_selectors <- c(
  player_name = ".player_name_out",
  season = ".season",
  is_win = ".wl",
  result = ".result_out",
  is_first = ".gs",
  playing_time = ".mp",
  field_goal_percent = ".fgper",
  field_goal = ".fg",
  field_goal_a = ".fga",
  three_percent = ".threepper",
  three = ".threep",
  three_a = ".threepa",
  free_percent = ".ftper",
  free = ".ft",
  free_a = ".fta",
  rebound = ".trb",
  rebound_offen = ".orb",
  rebound_defen = ".drb",
  assist = ".ast",
  steal = ".stl",
  block = ".blk",
  turnover = ".tov",
  foul = ".pf",
  points = ".pts"
)
# Columns kept as text; every other column goes through as.numeric().
text_columns <- c("player_name", "season", "is_win", "result")
# Columns whose text carries a "%" sign.
percent_columns <- c("field_goal_percent", "three_percent", "free_percent")

# Extract one column (matches 2..151 of its selector) from a parsed page.
# Pages with fewer matches are padded with NA by the out-of-range index.
extract_column <- function(column, page) {
  raw <- html_text(html_nodes(page, stat_selectors[[column]]))[2:151]
  if (column %in% text_columns) {
    return(raw)
  }
  if (column %in% percent_columns) {
    # Strip the "%" sign before conversion, then rescale to a fraction.
    return(as.numeric(sub("%", "", raw)) / 100)
  }
  as.numeric(raw)
}

# The number of pages can be computed: 778447 (presumably the total number of
# result rows) at 150 rows per page gives 5190 pages, indexed from 0.
n_pages <- ceiling(778447 / 150)
page_frames <- vector("list", n_pages)

for (i in 0:(n_pages - 1)) {
  url <- paste0("http://www.stat-nba.com/query.php?page=", i, "&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30")
  web <- read_html(url, encoding = "UTF-8")

  columns <- lapply(names(stat_selectors), extract_column, page = web)
  names(columns) <- names(stat_selectors)
  data <- do.call(data.frame, columns)

  # A short page (the last one) is padded with NA by the fixed 2:151 index;
  # drop those padding rows so they do not end up in the combined result.
  data <- data[!is.na(data$player_name), , drop = FALSE]
  # Reset to automatic row names so the final rbind() numbers rows 1..n.
  rownames(data) <- NULL

  page_frames[[i + 1]] <- data
}

player_data <- do.call(rbind, page_frames)


# Segmented crawl for performance ----
# The full page range is 0-5189; this segment fetches pages 0 to 1000 into
# `player_data1`. To keep each iteration cheap, the scraped text is stored as
# is: no row subsetting and no type conversion are done here, so every match
# of each selector is kept.
# Each page is stored in a preallocated list and bound once at the end,
# instead of growing the data frame with rbind() on every iteration.
# If a fetch fails part-way, the pages fetched so far remain in `raw_pages`.

# CSS class selector for every output column, listed in output column order.
stat_selectors <- c(
  player_name = ".player_name_out",
  season = ".season",
  is_win = ".wl",
  result = ".result_out",
  is_first = ".gs",
  playing_time = ".mp",
  field_goal_percent = ".fgper",
  field_goal = ".fg",
  field_goal_a = ".fga",
  three_percent = ".threepper",
  three = ".threep",
  three_a = ".threepa",
  free_percent = ".ftper",
  free = ".ft",
  free_a = ".fta",
  rebound = ".trb",
  rebound_offen = ".orb",
  rebound_defen = ".drb",
  assist = ".ast",
  steal = ".stl",
  block = ".blk",
  turnover = ".tov",
  foul = ".pf",
  points = ".pts"
)

raw_pages <- vector("list", length(0:1000))

for (i in 0:1000) {
  url <- paste0("http://www.stat-nba.com/query.php?page=", i, "&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30")
  web <- read_html(url, encoding = "UTF-8")

  # One character vector per column, named after the output column.
  columns <- lapply(stat_selectors, function(css) {
    html_text(html_nodes(web, css))
  })
  data1 <- do.call(data.frame, columns)

  raw_pages[[i + 1]] <- data1

  # Progress report: page index and current time.
  print(i)
  print(Sys.time())
}

player_data1 <- do.call(rbind, raw_pages)

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值