思路
页面准备
library(RSelenium)
library(rvest)
# Site root; it is pasted in front of the hrefs scraped from the pages to
# build full links.
base <- "http://music.163.com"

# Address of the radio programme page that the scrape starts from.
url <- "http://music.163.com/#/djradio?id=169"

# Driver object for a Chrome session (the session itself is opened later
# via remDr$open()).
remDr <- remoteDriver(browserName = "chrome")

# Start the Selenium standalone server without waiting for the command to
# finish (wait = FALSE) and with its console window shown
# (invisible = FALSE).
shell(
  "java -jar D:/R/library/Rwebdriver/selenium-server-standalone-3.7.1.jar",
  wait = FALSE,
  invisible = FALSE
)
获取节目链接
# Step 1: ProgramlinkFunc walks through every page of the programme's
# episode list and collects the link of each episode.
#
# Args:
#   remDr: an RSelenium remote driver; a browser session is opened on it.
#   url:   address of the programme page to start from.
#
# Returns:
#   A data.frame with a single character column `programlink`, one row per
#   episode link found. Links are built by prefixing the scraped href with
#   the global `base`.
ProgramlinkFunc <- function(remDr, url) {
  remDr$open()
  remDr$navigate(url)
  # Switch the driver into the first <iframe> before reading the source.
  webElem <- remDr$findElements("css", "iframe")
  remDr$switchToFrame(webElem[[1]])

  # One data.frame per visited page; bound together once at the end instead
  # of growing a data.frame inside the loop.
  pages <- list()
  j <- 0
  repeat {
    j <- j + 1
    destination <- remDr$getPageSource()[[1]] %>% read_html()
    totalpage <- destination %>%
      html_nodes(".u-page a:nth-last-child(2)") %>%
      html_text()
    curpage <- destination %>%
      html_nodes(".u-page .js-selected") %>%
      html_text()
    href <- destination %>%
      html_nodes(".col2 .tt a") %>%
      html_attr("href")
    # paste0(base, character(0)) would still return `base` itself, which
    # would add a bogus link; only build links when hrefs were found.
    programlink <- if (length(href) > 0) paste0(base, href) else character(0)
    pages[[j]] <- data.frame(programlink, stringsAsFactors = FALSE)
    cat(sprintf("第【%d】页节目期数链接抓取成功", j), sep = "\n")

    # When the pager selectors match nothing, `curpage != totalpage` has
    # length zero and `if` would stop with an error; treat that case as
    # "this is the only/last page".
    is_last_page <- length(curpage) == 0 || length(totalpage) == 0 ||
      identical(curpage[1], totalpage[1])
    if (is_last_page) {
      break
    }
    # Click the link right after the currently selected page number.
    # NOTE(review): the next iteration reads the page source immediately
    # after the click - confirm the new page is rendered by then.
    remDr$executeScript(
      "arguments[0].click();",
      list(remDr$findElement("css", ".u-page .js-selected+a"))
    )
  }
  cat("All work is done!", sep = "\n")
  do.call(rbind, pages)
}
# Run step 1 and flatten the returned data.frame into a character vector.
# In the author's run this gave 229 values, one link per episode.
programlink <- unlist(ProgramlinkFunc(remDr, url))
获取歌曲链接
# Step 2: MusiclinkFunc visits each episode page and collects the links of
# the songs listed on it.
#
# Args:
#   link: character vector of episode links.
#
# Returns:
#   A data.frame with columns `num` (text of the episode title node) and
#   `musiclink` (song link, i.e. the global `base` followed by the href).
#
# Works on the global browser session `remDr`.
MusiclinkFunc <- function(link) {
  # Scrape the i-th episode page and return its songs as a data.frame.
  scrape_episode <- function(i) {
    remDr$navigate(link[i])
    frames <- remDr$findElements("css", "iframe")
    remDr$switchToFrame(frames[[1]])
    page <- read_html(remDr$getPageSource()[[1]])
    num <- html_text(html_nodes(page, ".tit .f-ff2"))
    musiclink <- paste0(base, html_attr(html_nodes(page, "span.txt a"), "href"))
    songs <- data.frame(num, musiclink, stringsAsFactors = FALSE)
    cat(sprintf("%s歌曲链接抓取成功", num), sep = "\n")
    songs
  }

  per_episode <- lapply(seq_along(link), scrape_episode)
  # Fold the per-episode tables onto an empty data.frame, in order.
  result <- Reduce(rbind, per_episode, data.frame())
  cat("All work is done!", sep = "\n")
  result
}
# Run step 2. musicinfo is a data.frame holding the episode title (num) and
# the song link (musiclink); the author's run produced 2063 rows.
musicinfo <- MusiclinkFunc(programlink)
# Pull out the song links and drop duplicates, since songs recur across
# episodes; 417 distinct links remained in the author's run.
musiclink <- unique(musicinfo$musiclink)
获取歌词
# Step 3: LyricFunc scrapes the title and the lyrics of one song page.
#
# Args:
#   musiclink: link of a single song page.
#
# Returns:
#   A data.frame with columns `musiclink`, `songname` and `lyric`.
#
# Works on the global browser session `remDr`. Any error raised while
# locating or clicking the expand link is not handled here and reaches the
# caller.
LyricFunc <- function(musiclink) {
  remDr$navigate(musiclink)
  frames <- remDr$findElements("css", "iframe")
  remDr$switchToFrame(frames[[1]])

  # Click the "expand" button of the song page before reading the source.
  expand_button <- remDr$findElement("css", "a#flag_ctrl")
  remDr$executeScript("arguments[0].click();", list(expand_button))

  page <- read_html(remDr$getPageSource()[[1]])
  songname <- html_text(html_nodes(page, ".tit em"))
  lyric <- html_text(html_nodes(page, "#lyric-content"))
  data.frame(musiclink, songname, lyric, stringsAsFactors = FALSE)
}
异常处理
# Step 4: run LyricFunc on every song link, catching errors with tryCatch.
# Each link gets at most 3 attempts, with a 2-second pause after every
# failed one. lyricinfo is a list named by song link; each element is
# either the data.frame returned by LyricFunc or the error condition of the
# last failed attempt, so the two kinds still have to be separated later.
# The author's run produced 417 elements.
lyricinfo <- list()
for (i in seq_along(musiclink)) {
  # Skip links that already have an entry in lyricinfo.
  if (musiclink[i] %in% names(lyricinfo)) {
    next
  }
  cat(paste("Doing", i, musiclink[i], "..."))
  ok <- FALSE
  counter <- 0
  # The condition is scalar, so use the short-circuiting && instead of the
  # elementwise &.
  while (!ok && counter < 3) {
    counter <- counter + 1
    output <- tryCatch(
      LyricFunc(musiclink[i]),
      error = function(e) {
        Sys.sleep(2)
        # Return the condition object so the failure can be recorded.
        e
      }
    )
    if (inherits(output, "error")) {
      cat("NA...")
    } else {
      ok <- TRUE
      cat("Done.")
    }
  }
  cat("\n")
  lyricinfo[[i]] <- output
  names(lyricinfo)[i] <- musiclink[i]
}
remDr$close()
cat("All work is done!", sep = "\n")
数据分离和提取
# Separate the scraped rows from the captured errors.
# A successful element of lyricinfo is a one-row data.frame with the three
# fields musiclink, songname and lyric; anything else (the error conditions
# stored for failed pages) is dropped. The check is made on the elements
# themselves rather than on a "length of unlist() equals 3" heuristic, and
# the filter is applied to the same list that is being bound.
# The kept rows are stacked into the data.frame `lyric`; the author's run
# left 397 songs after removing duplicates and songs without lyrics.
lyric <- do.call(
  rbind,
  Filter(function(x) is.data.frame(x) && nrow(x) == 1, lyricinfo)
)
# Open the result in the data viewer, then export it to a local
# comma-separated file without row names.
View(lyric)
write.table(lyric, file = "lyric.csv", sep = ",", row.names = FALSE)
查看数据
总结
- 网易云音乐界面的源代码几乎只有一个
<iframe>
,并且其src是空白的:about:blank
,当用浏览器模拟页面访问时,整个页面的框架已经被异步塞进<iframe>
中,因此必须先将网页元素定位至<iframe>
,再做抓取 :
...
# Open the browser session and load the page
remDr$open()
remDr$navigate(url)
# Find the <iframe> elements and switch the driver into the first one
webElem <- remDr$findElements("css", "iframe")
remDr$switchToFrame(webElem[[1]])
# Read the page source after the switch and parse it with read_html()
destination <- remDr$getPageSource()[[1]] %>% read_html()
...
- 异常处理中的
tryCatch
函数主要是用来剔除无歌词的歌曲,其实现的功能为:
- 对于异常的musiclink[i],执行while循环尝试重新抓取,共尝试3次,每次等待2秒
- 对于未提供歌词的界面,selenium无法定位到“展开”按钮,故会产生如下3次报错并跳过此页:
Doing 40 http://music.163.com/song?id=504264919 ...
Selenium message:no such element: Unable to locate element: {"method":"css selector","selector":"a#flag_ctrl"}
(Session info: chrome=63.0.3239.84)
(Driver info: chromedriver=2.33.506120 (e3e53437346286c0bc2d2dc9aa4915ba81d9023f),platform=Windows NT 10.0.15063 x86_64)
NA...
Selenium message:no such element: Unable to locate element: {"method":"css selector","selector":"a#flag_ctrl"}
(Session info: chrome=63.0.3239.84)
(Driver info: chromedriver=2.33.506120 (e3e53437346286c0bc2d2dc9aa4915ba81d9023f),platform=Windows NT 10.0.15063 x86_64)
NA...
Selenium message:no such element: Unable to locate element: {"method":"css selector","selector":"a#flag_ctrl"}
(Session info: chrome=63.0.3239.84)
(Driver info: chromedriver=2.33.506120 (e3e53437346286c0bc2d2dc9aa4915ba81d9023f),platform=Windows NT 10.0.15063 x86_64)
NA...
-
数据结果:在229(programlink)期节目中抓取到2063(musicinfo)首歌,这2063首歌中有相当一部分是重复的,去重后只剩余417(musiclink/lyricinfo)首歌,再剔除这417首歌中没有提供歌词的歌曲,剩余397(lyric)首含有完整歌词的歌曲。
-
进一步地,还可对数据做一些简单的探讨:
# 找出暂无歌词的歌曲链接(有417-397=20首)
> all <- musiclink %>% unlist() # 共417首
> with <- lyric$musiclink %>% unlist() # 397首有歌词
> without <- setdiff(all, with) # 20首无歌词
[1] "http://music.163.com/song?id=504264919" "http://music.163.com/song?id=481537919"
[3] "http://music.163.com/song?id=453580309" "http://music.163.com/song?id=430685305"
[5] "http://music.163.com/song?id=417594878" "http://music.163.com/song?id=34775098"
[7] "http://music.163.com/song?id=30706019" "http://music.163.com/song?id=29792690"
[9] "http://music.163.com/song?id=29795592" "http://music.163.com/song?id=29722431"
[11] "http://music.163.com/song?id=29450853" "http://music.163.com/song?id=29129764"
[13] "http://music.163.com" "http://music.163.com/song?id=28912429"
[15] "http://music.163.com/song?id=28695504" "http://music.163.com/song?id=28234998"
[17] "http://music.163.com/song?id=27843919" "http://music.163.com/song?id=28152047"
[19] "http://music.163.com/song?id=28182131" "http://music.163.com/song?id=5261818"
# 上述链接中有一个"http://music.163.com",它不是暂无歌词的歌曲链接
# 而是在Step2中,用函数MusiclinkFunc抓各期的歌曲链接时,有些期数无歌曲信息,paste将base和空值合并所得
> library(dplyr)
> filter(musicinfo, musiclink == "http://music.163.com")
# 用dplyr包中的filter()查看无歌曲信息的期数
num musiclink
1 《中国嘻哈榜》第229期 http://music.163.com
2 《中国嘻哈榜》第213期 Vol.52 http://music.163.com
3 《中国嘻哈榜》第187期 http://music.163.com
4 《中国嘻哈榜》第186期 http://music.163.com
5 《中国嘻哈榜》第184期 http://music.163.com
6 《中国嘻哈榜》第183期 http://music.163.com
7 《中国嘻哈榜》第175期 http://music.163.com
8 《中国嘻哈榜》第162期 http://music.163.com
参考资料:
反击爬虫,前端工程师的脑洞可以有多大?
中国有嘻哈:网易云、虾米音乐歌词爬虫项目分享
Click on cross domain iframe element using Rselenium