发现URL规律:各分页的URL只有页码不同(如下面两行中的 1 和 2),据此构建url,再使用BeautifulSoup解析页面。
url = 'https://www.kugou.com/yy/rank/home/1-8888.html?from=rank'
url = 'https://www.kugou.com/yy/rank/home/2-8888.html?from=rank'
爬取每首歌的排行、歌名、歌手和时长,最后组成字典:
# Walk the three parallel tag lists and turn every chart row into a dict.
# The loop names avoid `time`, which is also the name of a standard module.
for rank_tag, title_tag, time_tag in zip(ranks, titles, times):
    # Chart position
    rank = rank_tag.get_text().strip()

    # Singer and song title share one link text, singer first.
    # Prefer the spaced separator so a hyphen inside a name (e.g. "A-Lin")
    # is not mistaken for the divider; this assumes the page writes
    # "singer - song" with spaces -- TODO confirm against the live markup.
    text = title_tag.get_text().strip()
    singer, found, song = text.partition(' - ')
    if not found:
        singer, found, song = text.partition('-')
    if found:
        # Strip both halves; previously only the song was stripped.
        singer, song = singer.strip(), song.strip()
    else:
        # No separator at all: keep the whole text in both fields.
        singer = song = text

    # Track length
    song_time = time_tag.get_text().strip()

    print(rank, song, singer, song_time)

    # The record to upload must be a dict.
    data = {
        'rank': rank,
        'song': song,
        'singer': singer,
        'song_time': song_time,
    }
代码
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
# Connect to MongoDB using the client's default connection settings.
client = MongoClient()
# Select database KG_DB, then its `songs` collection.
songs = client.KG_DB.songs
# Request headers sent with every page fetch. The User-Agent value was
# missing the closing parenthesis of its comment section; it is restored
# here so the header is well-formed.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
}
def _split_title(text):
    """Split a chart entry's link text into ``(singer, song)``.

    The singer comes first and the song last. The spaced separator
    ``' - '`` is tried before a bare ``'-'`` so that a hyphen inside a name
    (e.g. "A-Lin") is not mistaken for the divider; this assumes the page
    writes "singer - song" with spaces -- TODO confirm against the live
    markup. Both parts are stripped. When no separator is present the whole
    stripped text is returned for both fields.
    """
    text = text.strip()
    for separator in (' - ', '-'):
        singer, found, song = text.partition(separator)
        if found:
            return singer.strip(), song.strip()
    return text, text


def kg_spider(url):
    """Fetch one chart page, print every entry and store it in MongoDB.

    Args:
        url: Address of the chart page to download.

    Returns:
        None. Each row found on the page is printed, inserted into the
        module-level ``songs`` collection, and the id of the inserted
        document is printed.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(res.text, 'lxml')

    # Locate the needed elements by CSS selector.
    ranks = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')

    # The loop names avoid `time`, which is also the imported module's name.
    for rank_tag, title_tag, time_tag in zip(ranks, titles, times):
        # Chart position
        rank = rank_tag.get_text().strip()
        # Singer and song title
        singer, song = _split_title(title_tag.get_text())
        # Track length
        song_time = time_tag.get_text().strip()
        print(rank, song, singer, song_time)

        # The record to upload must be a dict.
        data = {
            'rank': rank,
            'song': song,
            'singer': singer,
            'song_time': song_time,
        }

        # Store the record. Collection.insert() no longer exists in
        # current PyMongo, so use insert_one() and read the new id from
        # its result.
        songs_id = songs.insert_one(data).inserted_id
        print(songs_id)
if __name__ == '__main__':
    # Build the list of chart page URLs; only the page number varies.
    # NOTE(review): range(1, 22) yields pages 1-21 -- confirm this matches
    # the number of pages the chart actually has.
    urls = [
        'https://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'.format(i)
        for i in range(1, 22)
    ]
    for url in urls:
        # kg_spider stores the rows itself and returns nothing, so its
        # result is not bound to a name.
        kg_spider(url)
        # Pause between requests.
        time.sleep(1)