import json
import time

import requests
from bs4 import BeautifulSoup


class Kugou(object):
    """Scraper for Kugou music ranking pages.

    Each call to ``getInfo`` downloads one ranking page, prints every song
    found on it and appends the songs to ``OUTPUT_FILE``.
    """

    # File that scraped records are appended to (one ``str(dict)`` per line).
    OUTPUT_FILE = '歌曲排行版.txt'
    # Seconds to wait for the server before giving up on a page request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Headers sent with every request. Empty by default; fill in whatever
        # the site requires (e.g. a User-Agent) before scraping.
        self.header = {}

    @staticmethod
    def _split_title(text):
        """Split a song link text into ``(singer, title)``.

        The text is assumed to look like ``"singer - title"`` (TODO confirm
        against the live page). The spaced separator is tried first so that
        hyphens inside a singer or song name are kept; a bare ``-`` is the
        fallback. When no separator is present the whole text is returned as
        the title with an empty singer instead of raising ``IndexError``.
        """
        for separator in (' - ', '-'):
            singer, found, title = text.partition(separator)
            if found:
                return singer.strip(), title.strip()
        return '', text.strip()

    def getInfo(self, url):
        """Fetch one ranking page, print its songs and append them to disk.

        Args:
            url: Address of the ranking page to download.

        Raises:
            requests.RequestException: If the page cannot be fetched,
                including when ``REQUEST_TIMEOUT`` is exceeded.
        """
        response = requests.get(
            url, headers=self.header, timeout=self.REQUEST_TIMEOUT
        )
        soup = BeautifulSoup(response.text, 'html.parser')

        # Three parallel node lists; zip() below pairs them up positionally.
        ranks = soup.select('.pc_temp_num')
        titles = soup.select('.pc_temp_songlist > ul > li > a')
        times = soup.select('.pc_temp_time')

        lines = []
        for rank, title, songTime in zip(ranks, titles, times):
            singer, song = self._split_title(title.get_text())
            # get_text() drops the HTML tags that printing the node would show.
            data = {
                'rank': rank.get_text().strip(),
                'title': song,
                'singer': singer,
                'songTime': songTime.get_text().strip()
            }
            print('rank:%2s\t' % data['rank'],
                  'title:%2s\t' % data['title'],
                  'singer:%2s\t' % data['singer'],
                  'songTime:%2s\t' % data['songTime'])
            lines.append(str(data) + '\n')

        # Nothing matched: leave the output file untouched.
        if not lines:
            return

        # Open the file once per page rather than once per record.
        with open(self.OUTPUT_FILE, 'a', encoding='utf8') as f:
            f.writelines(lines)


if __name__ == '__main__':
    # Runs only when the module is executed directly, not when imported.
    urls = [
        'http://www.kugou.com/yy/rank/home/{}-8888.html'.format(i)
        for i in range(10)
    ]
    kugou = Kugou()
    for url in urls:
        kugou.getInfo(url)
        # Pause one second between pages so the output can be followed.
        time.sleep(1)
# 爬取酷狗音乐排行榜;运行前需要自行补充请求头(例如 User-Agent)。
# 于 2024-01-09 14:46:41 首次发布