"""Scrape the Kugou music ranking pages and store each entry in MongoDB."""
import json
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# MongoClient() is created with the library defaults; scraped records are
# written to the `songs` collection of the `kugou_db` database.
client = MongoClient()
songs = client.kugou_db.songs

# Browser-like User-Agent sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

# Seconds to wait for the server before giving up, so one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def _split_title(text):
    """Split a chart entry's link text into ``(singer, song_name)``.

    The text is split once, preferring the spaced separator ``' - '`` so that
    hyphens inside an artist or song name are kept intact, and falling back
    to a bare ``'-'``.  When no separator is present the whole text is
    returned as the song name with an empty singer, instead of raising
    ``IndexError``.
    """
    singer, sep, song_name = text.partition(' - ')
    if not sep:
        singer, sep, song_name = text.partition('-')
    if not sep:
        return '', text.strip()
    return singer.strip(), song_name.strip()


def get_info(url):
    """Fetch one ranking page, then print and store every song listed on it.

    Args:
        url: Address of the ranking page to download.

    For each (rank, title, duration) triple found on the page, a dict with the
    keys ``rank``, ``singer``, ``song_name`` and ``time`` is printed, inserted
    into the ``songs`` collection, and the generated document id is printed.
    """
    response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    # Parse the raw bytes so the UTF-8 hint is actually applied;
    # `from_encoding` is ignored when BeautifulSoup is given decoded text.
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # The CSS selectors below were obtained from the browser developer tools
    # (Inspect -> Copy -> Copy selector).
    ranks = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    song_times = soup.select('.pc_temp_time')

    for rank, title, song_time in zip(ranks, titles, song_times):
        singer, song_name = _split_title(title.get_text())
        data = {
            'rank': rank.get_text().strip(),
            'singer': singer,
            'song_name': song_name,
            'time': song_time.get_text().strip(),
        }
        print(data)

        # Store the record in the database.  `insert_one` replaces the
        # `Collection.insert` method, which was removed in PyMongo 4.
        # NOTE: to also dump the record with output_html(data), call it
        # before this insert -- the insert adds an `_id` (ObjectId) key to
        # `data`, which json.dumps cannot serialize.
        song_id = songs.insert_one(data).inserted_id
        print(song_id)
        print('----------------------------')


def output_html(data):
    """Append ``data`` to ``output1.json`` as UTF-8 encoded JSON text.

    Args:
        data: A JSON-serializable object (e.g. the dict built in ``get_info``).

    Records are appended back to back with no separator between them.
    """
    js_obj = json.dumps(data, ensure_ascii=False)
    # Open in text mode with an explicit encoding and write `str`; writing
    # encoded bytes to a text-mode file raises TypeError on Python 3.
    with open('output1.json', 'a', encoding='utf-8') as foutput:
        foutput.write(js_obj)


def main():
    """Crawl ranking pages 1 through 23, pausing one second between pages."""
    # The page range could be derived from the page data itself; it is
    # hard-coded here.
    urls = [
        'http://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'.format(str(i))
        for i in range(1, 24)
    ]
    for url in urls:
        get_info(url)
        time.sleep(1)


if __name__ == '__main__':
    main()
爬取酷狗音乐排行榜单
最新推荐文章于 2024-07-18 20:42:20 发布