【CSS Selector】酷狗Top500（Python & R）

最新推荐文章于 2023-11-14 23:16:16 发布

Joyliness

最新推荐文章于 2023-11-14 23:16:16 发布

阅读量781

点赞数

本文链接：https://blog.csdn.net/Joyliness/article/details/79106107

版权

Python

# 加载模块
import pandas
import requests
from bs4 import BeautifulSoup

# 伪装报头
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
           AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
          }

# 定义get_info，获取排名情况、歌手、歌名、歌曲链接、歌曲时长
def get_info(url):
    destination = requests.get(url, headers = headers)
    soup   = BeautifulSoup(destination.text, 'lxml')
    ranks  = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist ul li > a')
    times  = soup.select('span.pc_temp_time')
    total  = []
    for rank, title, time in zip(ranks, titles, times):
        data = { 'rank' : rank.get_text().strip(),
                 'singer' : title.get_text().split('-')[0],
                 'song' : title.get_text().split('-')[1],
                 'link' : title.get('href'),
                 'time' : time.get_text().strip()
               }
        total.append(data)
    return total
    
# 程序入口 
result = []     
for i in range(1, 24):
    url = 'http://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'.format(i)
    result.append(get_info(url))
    print(result)

# 导出数据
deal1 = list(map(pandas.DataFrame, result)) # 有序排列
deal2 = pandas.concat(deal1) # 合并每一页的内容，只保留一个总表头
deal2.to_csv('kugouTop500.csv')

参考资料：
爬取酷狗top500
从零开始学Python网络爬虫

R

# 加载包
library(rvest)
library(stringr)

# 定义GetinfoFunc，获取排名情况、歌手、歌名、歌曲链接、歌曲时长
GetinfoFunc <- function(url) {
  result <- data.frame()
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    rank   <- destination %>% html_nodes('span.pc_temp_num') %>% html_text() %>% str_trim()
    title  <- destination %>% html_nodes('div.pc_temp_songlist ul li > a') %>% html_text()
    singer <- title %>% strsplit(., '-') %>% lapply(., function(x){x[1]}) %>% unlist()
    song   <- title %>% strsplit(., '-') %>% lapply(., function(x){x[2]}) %>% unlist()
    link   <- destination %>% html_nodes('div.pc_temp_songlist ul li > a') %>% html_attr('href')
    time   <- destination %>% html_nodes('span.pc_temp_time') %>% html_text() %>% str_trim()
    data   <- data.frame(rank, singer, song, link, time)
    result <- rbind(result, data)
    cat(sprintf('第【%d】页歌曲抓取成功', i), sep = '\n')
  }
  return(result)
} 

# 执行函数
base1 <- 'http://www.kugou.com/yy/rank/home/'
base2 <- '-8888.html'
url   <- paste0(base1, 1:23, base2)
kugou <- GetinfoFunc(url)

# 导出csv文件
write.table(kugou, row.names = FALSE, sep = ',', 'kugouTop500.csv')